How to perform Structural Topic Modeling (stm) in R. This code works on Twitter data but can be used with any corpus that has a unique id field.
library(tidyverse)
loc <- "FILEPATH/data.csv"
tweets <- read_csv(loc) # fetching data
# Data was already preprocessed in Python, but the same steps can be done in R
# Preprocessing steps:
# lower-cased, URLs removed, stop words removed, punctuation removed except # and @
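# If preprocessing had to be done in R instead, a minimal sketch could look like the
# function below (the regexes are illustrative, not the exact rules used in the Python
# step; stop word removal is handled later in the dfm() call)
clean_tweet_text <- function(x) {
  x <- tolower(x)                              # lower-case everything
  x <- gsub("http[^[:space:]]+", " ", x)       # drop URLs
  x <- gsub("[^a-z0-9#@[:space:]]", " ", x)    # keep letters, digits, # and @
  gsub("[[:space:]]+", " ", trimws(x))         # collapse repeated whitespace
}
# tweets$body <- clean_tweet_text(tweets$body) # uncomment to apply in R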
library(quanteda)
tweetCorpus <- corpus(tweets$body) # creating corpus
docvars(tweetCorpus, field = "cov1") <- tweets$local # covariance field local
docvars(tweetCorpus, field = "cov2") <- tweets$HotIssue # covariance field HotIssue
docvars(tweetCorpus, "id") <- tweets$tweetId # adding this field to keep track of tweet ids
# each date is converted into a number
docvars(tweetCorpus, "cov3Time") <- as.integer(as.factor(substr(tweets$postedTime,1,10))) # covariance field postedtime
# add any corpus-specific stop words here as well;
# use the term/document frequency results (e.g., topfeatures below) to identify them
stopWords <- c("t.co","http","https","amp","t","t.c","c","rt") # corpus-specific stop words to remove
# Converting corpus to document frequency matrix
tweetDfm <- dfm(tweetCorpus,
                remove = c(stopwords("english"), stopWords), # removing standard and corpus-specific stop words
                ngrams = 1L,
                stem = FALSE, # no stemming
                remove_numbers = TRUE,
                remove_punct = TRUE,
                remove_symbols = TRUE) %>%
  dfm_trim(min_count = 10, min_docfreq = 5)
# min_count: drop words that occur fewer than 10 times overall
# min_docfreq: drop words that occur in fewer than 5 documents
# showing top 50 words with highest frequency
topfeatures(tweetDfm, n = 50)
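# If topfeatures() still surfaces noise tokens, they can be dropped from the dfm with
# dfm_remove(); the tokens below are hypothetical examples
# tweetDfm <- dfm_remove(tweetDfm, c("via", "just"))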
library(stm)
# converting dfm to stmdfm
stmtweetDfm <- convert(tweetDfm, to = "stm")
# preparing out variable for ctm or stm model
out <- prepDocuments(stmtweetDfm$documents,
                     stmtweetDfm$vocab,
                     stmtweetDfm$meta,
                     lower.thresh = 10) # drop words that appear in 10 or fewer documents
# fetching tweet ids of all documents that survived prepDocuments
keptIdx <- setdiff(seq_len(nrow(stmtweetDfm$meta)), out$docs.removed)
tweetIdsKept <- stmtweetDfm$meta$id[keptIdx]
tweetsCleaned <- subset(tweets, tweets$tweetId %in% tweetIdsKept) # keeping only tweets that were not removed
save(tweetsCleaned, file = "FILEPATH/tweetsCleaned.RData")
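# Optional: before fixing K, searchK() can be used to compare candidate numbers of
# topics; the K values below are illustrative, not the ones used here
# kResult <- searchK(out$documents, out$vocab, K = c(20, 30, 40, 50),
#                    prevalence = ~cov1 + cov2 + s(cov3Time), data = out$meta)
# plot(kResult)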
k <- 40 # number of topics
stmResult <- stm(out$documents, out$vocab, K = k,
                 prevalence = ~cov1 + cov2 + s(cov3Time),
                 max.em.its = 180, data = out$meta,
                 init.type = "Spectral", seed = 300)
save(stmResult, file = "FILEPATH/stmResult.Rdata")
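# A few standard ways to inspect the fitted model; the topic choice and the reuse of
# the prevalence formula below are illustrative
labelTopics(stmResult, n = 10)                  # top words for each topic
plot(stmResult, type = "summary", n = 5)        # expected topic proportions with top words
# estimating how the prevalence covariates relate to topic proportions
prep <- estimateEffect(1:k ~ cov1 + cov2 + s(cov3Time), stmResult,
                       metadata = out$meta, uncertainty = "Global")
summary(prep, topics = 1)                       # effect estimates for topic 1 (example)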