I have a data frame of jargon words, a data frame of users and a data frame of forum posts (with the post text) in R. The aim is to select the group of important users: those who have used jargon words with a term frequency greater than 0.005 and who have used jargon words more than 30 times in their own posts. This is the code so far, but the if statement inside my for loop does not take into account that the relevant words must come from each individual user's posts.
Any help would be appreciated.
# Illustrative sample values for the objects used below ----
# Each object is a plain character vector. Several of these names are
# reassigned further down in the script, so these are only examples of
# what the values look like.
jargon_words <- c("man", "heavy", "today", "last_night", "total")
term_frequency_tf <- c("hello", "old", "today", "total")
unique_users <- c("Xman", "23mate", "Hslim", "jacob6")
forum_posts <- c("hi my name is jeff", "whatsup doc", "hi mate today")
# Build the stemmed jargon lexicon ----
# Input:  `jargon_words`, either a character vector or a data frame whose
#         first column holds the jargon terms.
# Output: `jargon_words`, a base data.frame in which the raw token column
#         has been replaced by `word_stem` (the SnowballC stem of each
#         token that is not a stop word).

# Normalise the input to a data frame whose first column is `text`.
jargon_words <- as.data.frame(jargon_words, stringsAsFactors = FALSE)
names(jargon_words)[1] <- "text"
jargon_words$text <- as.character(jargon_words$text)
# Fail loudly instead of just printing the class for manual inspection.
stopifnot(is.character(jargon_words$text))

# Tokenise, drop stop words and stem.
jargon_words <- as_tibble(jargon_words) %>%
  unnest_tokens(word, text) %>%
  # Name the join key explicitly: this silences the "Joining, by" message
  # and prevents an accidental join on any other shared column name.
  anti_join(stop_words, by = "word") %>%
  mutate(word_stem = SnowballC::wordStem(word)) %>%
  # Drop the raw token by name; dropping "column 1" by position removes
  # the wrong column as soon as the input carries more than one column.
  select(-word)
jargon_words

# Downstream code expects a plain data.frame.
jargon_words <- as.data.frame(jargon_words)
# Prepare the forum posts ----
# Takes the first column of `df` as the post text and produces
# `forum_posts`, a tibble with columns `text` (cleaned post text) and
# `doc_id` (running row number of the post).
# NOTE(review): assumes `df` has one row per post -- confirm.
forum_posts <- df %>%
  select(1)
# seq_len() instead of seq.int(): for a zero-row `df`, seq.int(0) yields
# c(1, 0) and the column assignment fails, whereas seq_len(0) is empty.
forum_posts$doc_id <- seq_len(nrow(forum_posts))
names(forum_posts)[1] <- "text"
forum_posts <- as_tibble(forum_posts)

# Replace punctuation, then digits, with a space so that neighbouring
# words are not glued together, and finally trim the outer whitespace.
forum_posts$text <- forum_posts$text %>%
  str_replace_all("[[:punct:]]", " ") %>%
  str_replace_all("[[:digit:]]", " ") %>%
  str_trim(side = "both")
# Calculate term frequency (tf) for every stemmed word in every post ----
# Output: `term_frequency`, a base data.frame with one row per
# (doc_id, word_stem) pair and the columns doc_id, word_stem, n (count of
# the stem in the post), post_sum (total counted stems in the post), tf,
# idf and tf_idf, sorted by descending tf.
library(SnowballC)
term_frequency <- forum_posts %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word") %>%
  mutate(word_stem = SnowballC::wordStem(word)) %>%
  count(doc_id, word_stem) %>%
  group_by(doc_id) %>%
  mutate(post_sum = sum(n)) %>%
  # Leave no grouping behind for the following steps and for callers.
  ungroup() %>%
  # bind_tf_idf(tbl, term, document, n): the document column must be the
  # post identifier. Passing `post_sum` here treated all posts with the
  # same word total as one document and produced wrong tf/idf values.
  bind_tf_idf(word_stem, doc_id, n) %>%
  arrange(desc(tf)) %>%
  as.data.frame()
# Select the important users ----
# A user is important when the jargon words found in that user's OWN posts
# (counting only stems whose per-post tf exceeds `min_tf`) add up to more
# than `min_jargon_uses` occurrences.
# NOTE(review): tf is measured per post here, as computed in
# `term_frequency`; switch to a per-user tf if that is the intended
# definition.
# Reads:  term_frequency (doc_id, word_stem, n, tf), jargon_words
#         (column word_stem), forum_posts (doc_id), df (column user, one
#         row per post, aligned with forum_posts).
# Writes: term_frequency_tf, unique_users, important_users.
min_tf <- 0.005
min_jargon_uses <- 30

# Stems with a term frequency above the threshold (across all posts).
term_frequency_tf <- term_frequency$word_stem[which(term_frequency$tf > min_tf)]

# levels() is NULL unless `df$user` is a factor, which silently skipped
# every user; unique() works for factor and character columns alike.
unique_users <- sort(unique(as.character(df$user[!is.na(df$user)])))

# Keep only the rows that are a jargon stem with sufficient tf.
jargon_stems <- unique(as.character(jargon_words$word_stem))
is_jargon_use <- !is.na(term_frequency$tf) &
  term_frequency$tf > min_tf &
  term_frequency$word_stem %in% jargon_stems
jargon_rows <- term_frequency[is_jargon_use, , drop = FALSE]

# Attribute every remaining row to the author of its post.
post_row <- match(jargon_rows$doc_id, forum_posts$doc_id)
post_owner <- as.character(df$user)[post_row]

# Total number of jargon occurrences per user (0 when a user has none).
jargon_use_count <- vapply(
  unique_users,
  function(user) {
    as.numeric(sum(jargon_rows$n[!is.na(post_owner) & post_owner == user]))
  },
  numeric(1)
)

important_users <- unique_users[jargon_use_count > min_jargon_uses]
print(important_users)
Aucun commentaire:
Enregistrer un commentaire