How to perform Structural Topic Modeling (stm) in R. This code works on Twitter data but can be used with any corpus that has a unique id field.
library(tidyverse)
loc <- "FILEPATH/data.csv"
tweets <- read_csv(loc) # fetching data
# Data was already preprocessed in Python, but the same steps can be done in R
# Preprocessing steps:
# lower-cased, URLs removed, stop words removed, all punctuation removed except # and @
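# A minimal sketch of the same cleaning done in R (assumes the raw text is in
# tweets$body); each step is idempotent, so it is harmless on already-clean text.
# Stop-word removal is handled later, in dfm().
tweets$body <- tolower(tweets$body)                             # lower case
tweets$body <- gsub("http\\S+|www\\.\\S+", " ", tweets$body)    # drop URLs
tweets$body <- gsub("[^a-z0-9_#@[:space:]]", " ", tweets$body)  # keep #, @, _ and alphanumerics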
library(quanteda)
tweetCorpus <- corpus(tweets$body) # creating corpus
docvars(tweetCorpus, field = "cov1") <- tweets$local # covariate: local
docvars(tweetCorpus, field = "cov2") <- tweets$HotIssue # covariate: HotIssue
docvars(tweetCorpus, "id") <- tweets$tweetId # keep track of tweet ids
# each date becomes an integer: factor levels of ISO "YYYY-MM-DD" strings sort chronologically
docvars(tweetCorpus, "cov3Time") <- as.integer(as.factor(substr(tweets$postedTime,1,10))) # covariate: postedTime
# add any stop words specific to your corpus as well;
# term and document frequencies help identify them (see the docfreq() check below)
stopWords <- c("t.co","http","https","amp","t","t.c","c","rt") # corpus-specific stop words
# Converting corpus to document-feature matrix
tweetDfm <- dfm(tweetCorpus,
                remove = c(stopwords("english"), stopWords), # removing all those stop words
                ngrams = 1L, # unigrams only
                stem = FALSE, # no stemming
                remove_numbers = TRUE,
                remove_punct = TRUE,
                remove_symbols = TRUE) %>%
  dfm_trim(min_count = 10, min_docfreq = 5)
# min_count = remove words used fewer than 10 times overall
# min_docfreq = remove words used in fewer than 5 docs
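# Note: the dfm() arguments above follow the quanteda v1 API this gist was
# written against; in quanteda >= 3.0 the same pipeline is written with
# tokens(), roughly:
# tweetDfm <- tokens(tweetCorpus, remove_numbers = TRUE, remove_punct = TRUE,
#                    remove_symbols = TRUE) %>%
#   tokens_remove(c(stopwords("english"), stopWords)) %>%
#   dfm() %>%
#   dfm_trim(min_termfreq = 10, min_docfreq = 5)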
# showing the 50 highest-frequency words
topfeatures(tweetDfm, n = 50)
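# Document frequency is often more revealing than raw counts when hunting for
# corpus-specific stop words; docfreq() is part of quanteda:
head(sort(docfreq(tweetDfm), decreasing = TRUE), 50)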
library(stm)
# converting dfm to stm format
stmtweetDfm <- convert(tweetDfm, to = "stm")
# preparing the documents, vocab, and metadata for the ctm/stm model
out <- prepDocuments(stmtweetDfm$documents,
                     stmtweetDfm$vocab,
                     stmtweetDfm$meta,
                     lower.thresh = 10) # drop words appearing in 10 or fewer documents
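# Quick sanity check on what prepDocuments() dropped (docs.removed and
# words.removed are part of its return value):
length(out$docs.removed)  # documents removed entirely
length(out$words.removed) # vocabulary terms removed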
# fetching tweet ids of all documents that survived prepDocuments()
# (guarded, because x[-integer(0)] returns an empty vector when nothing was removed)
tweetsKept <- if (length(out$docs.removed) > 0)
  stmtweetDfm$meta$id[-out$docs.removed] else stmtweetDfm$meta$id
tweetsCleaned <- subset(tweets, tweets$tweetId %in% tweetsKept) # keeping only those tweets
save(tweetsCleaned, file = "FILEPATH/tweetsCleaned.RData")
k <- 40 # number of topics
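# K = 40 is a judgment call; searchK() (part of stm) can compare candidate
# values on held-out likelihood, residuals, and semantic coherence. A sketch,
# potentially slow on a large corpus, so left commented out:
# kResult <- searchK(out$documents, out$vocab, K = c(20, 30, 40, 50),
#                    prevalence = ~cov1 + cov2 + s(cov3Time),
#                    data = out$meta, init.type = "Spectral")
# plot(kResult)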
stmResult <- stm(out$documents, out$vocab, K = k, prevalence = ~cov1 + cov2 + s(cov3Time),
                 max.em.its = 180, data = out$meta, init.type = "Spectral", seed = 300)
save(stmResult, file = "FILEPATH/stmResult.RData")
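# Typical next steps once the model has run: top words per topic, and how topic
# prevalence varies with the covariates declared above. labelTopics() and
# estimateEffect() are both part of stm.
labelTopics(stmResult, n = 10)
effects <- estimateEffect(1:k ~ cov1 + cov2 + s(cov3Time), stmResult,
                          metadata = out$meta)
summary(effects, topics = 1) # covariate effects on topic 1's prevalence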