lundi 2 mars 2020

How to prevent appending duplicate string values in array?

kwdArr = [
    u'"Sin Tax"' -> might appear here
]

tryKWDS = [
    u'"Sin Tax" "Cigarette"', 
    u'"Sin Tax" "Cigarettes"', 
    u'"Sin Tax" "E-cigarette"', 
    u'"Sin Tax" "E-cigarettes"', 
    u'"Sin Tax" "Heated Tobacco Product"', 
    u'"Sin Tax" "Heated Tobacco Products"',
    u'"Sin Tax"' -> might appear here
]

How to prevent appending other sin tax with keywords in my kwdArr, since a single word of ("Sin Tax") appear in the array ? and vice-versa, all of "Sin Tax" with other words should append if there's no single word ("Sin Tax") in two arrays?

def removeDuplicates(seen_words):
    tryDuplicateArr = []
    for x in kwdArr:
        tryDuplicateArr.append(x.replace('"', '').strip('\"'))
    for words in tryDuplicateArr:
        words = words.replace('"', '').strip('\"')
        seen_words |= set(words.split())

def removeDupliWords(wordsArr):
    seen_words = set()
    for words in wordsArr:
        rawWords = words
        removeDuplicates(seen_words)
        words = words.replace('"', '').strip('\"')
        print(words)
        seen = False
        for word in words.split():
            if word in seen_words:
                seen = True
                continue
        print(seen)
        if not seen:
            checkRawWords = rawWords.replace('"', '').strip('\"')
            result = checkRawWords.endswith("Corp")
            if result == True:
                rawWords = re.sub('Corp$', 'Corporation', checkRawWords)
                rawWords = '"{}"'.format(rawWords)
            kwdArr.append(u'{}'.format(rawWords))

tryKWDS.sort(key=lambda x: len(x.split()))
removeDupliWords(tryKWDS)
def removeDuplicates(seen_words):
    tryDuplicateArr = []
    for x in kwdArr:
        tryDuplicateArr.append(x.replace('"', '').strip('\"'))
    for words in tryDuplicateArr:
        words = words.replace('"', '').strip('\"')
        seen_words |= set(words.split())

def removeDupliWords(wordsArr):
    seen_words = set()
    for words in wordsArr:
        rawWords = words
        removeDuplicates(seen_words)
        words = words.replace('"', '').strip('\"')
        print(words)
        seen = False
        for word in words.split():
            if word in seen_words:
                seen = True
                continue
        print(seen)
        if not seen:
            checkRawWords = rawWords.replace('"', '').strip('\"')
            result = checkRawWords.endswith("Corp")
            if result == True:
                rawWords = re.sub('Corp$', 'Corporation', checkRawWords)
                rawWords = '"{}"'.format(rawWords)
            kwdArr.append(u'{}'.format(rawWords))

tryKWDS.sort(key=lambda x: len(x.split()))
removeDupliWords(tryKWDS)
print(kwdArr)

This is my code so far, however the First ("sin tax" "cigarette") was appended if Sin Tax is not present in tryKWDS & and kwdArr but this is working if a single word of ("Sin Tax") is present?

Aucun commentaire:

Enregistrer un commentaire