I'm working on a method that takes a streaming tweets then apply some preprocess techniques and at the end it classifies the tweets into emotions (happy, sad, scared, angry and surprised).
My method is so simple yet it doesn't give the correct results. When I print the counter of an emotion it prints incorrect numbers for instance (24)? even though it's impossible of the number to be larger then the number of tokens!
Also, the final emotion of the a tweet is always surprise? even tho happy may has the larger count!
I've tried to test the method in separate file and on a string and it works fine. but with the streaming tweets no :(
This is my code:
import re
import sys
import itertools
import csv
from nltk.tokenize import TweetTokenizer
from pubnub.callbacks import SubscribeCallback
from pubnub.enums import PNStatusCategory
from pubnub.pnconfiguration import PNConfiguration
from pubnub.pubnub import PubNub
from nltk.stem.isri import ISRIStemmer
from collections import defaultdict
def emotionAnalysis (tweetText):
# tweetText = "I was happy today but now I'm very sad"
text = ""
tknzr = TweetTokenizer()
text = tknzr.tokenize(tweetText)
columns = defaultdict(list)
with open('Dateset.csv') as f:
reader = csv.DictReader(f)
for row in reader:
for (k,v) in row.items():
columns[k].append(v)
happy_array = list(columns['Happy'])
happy_array.sort()
sad_array = list(columns['Sad'])
sad_array.sort()
angry_array = list(columns['Angry'])
angry_array.sort()
scared_array = list(columns['Scared'])
scared_array.sort()
surprised_array = list(columns['Surprised'])
surprised_array.sort()
happy_counter = 0
sad_counter = 0
angry_counter = 0
scared_counter = 0
surprised_counter = 0
for index in text:
index = ''.join([i for i in index if not i.isdigit()]) # Remove digitss
index = re.sub(r"http\S+", "",index) # Remove links
index = re.sub(r'[^\w\s]','', index) # Remove punctuation
if (index in angry_array):
angry_counter += 1
if (index in surprised_array):
surprised_counter += 1
if (index in scared_array):
scared_counter += 1
if (index in sad_array):
sad_counter += 1
if (index in happy_array)::
happy_counter += 1
maxEmotion = max(happy_counter,sad_counter,angry_counter,surprised_counter,scared_counter)
emotion = ""
if (maxEmotion == 0):
emotion = "none"
if (maxEmotion == sad_counter):
emotion = "sad"
if (maxEmotion == happy_counter):
emotion = "happy"
if (maxEmotion == angry_counter):
emotion = "angry"
if (maxEmotion == scared_counter):
emotion = "scared"
if (maxEmotion == surprised_counter):
emotion = "surprised"
print (emotion)
My stream method just calls for the analysis method:
def message(self, pubnub, message):
print(emotionAnalysis(message.message['text']))
Thank you in advance!
Aucun commentaire:
Enregistrer un commentaire