How to perform Structural Topic Modeling (stm) in R. This code works on Twitter data but can be used with any corpus that has a unique id field.
library(tidyverse)
loc <- "FILEPATH/data.csv"
tweets <- read_csv(loc) # fetching data
# Data was already preprocessed in Python, but the same steps can be done in R
# Preprocessing steps:
# lower-cased, URLs removed, stop words removed, all punctuation removed except # and @
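# A minimal sketch of the same cleaning done in R (assumes the raw text is in
# tweets$body); each step is idempotent, so it is harmless on already-clean text.
# Stop-word removal is handled later, in dfm().
tweets$body <- tolower(tweets$body)                             # lower case
tweets$body <- gsub("http\\S+|www\\.\\S+", " ", tweets$body)    # drop URLs
tweets$body <- gsub("[^a-z0-9_#@[:space:]]", " ", tweets$body)  # keep #, @, _ and alphanumerics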
library(quanteda)
tweetCorpus <- corpus(tweets$body) # creating corpus
docvars(tweetCorpus, field = "cov1") <- tweets$local # covariate: local
docvars(tweetCorpus, field = "cov2") <- tweets$HotIssue # covariate: HotIssue
docvars(tweetCorpus, "id") <- tweets$tweetId # keep track of tweet ids
# each date becomes an integer: factor levels of ISO "YYYY-MM-DD" strings sort chronologically
docvars(tweetCorpus, "cov3Time") <- as.integer(as.factor(substr(tweets$postedTime,1,10))) # covariate: postedTime
# add any stop words specific to your corpus as well;
# term and document frequencies help identify them (see the docfreq() check below)
stopWords <- c("t.co","http","https","amp","t","t.c","c","rt") # corpus-specific stop words
# Converting corpus to document-feature matrix
tweetDfm <- dfm(tweetCorpus,
                remove = c(stopwords("english"), stopWords), # removing all those stop words
                ngrams = 1L, # unigrams only
                stem = FALSE, # no stemming
                remove_numbers = TRUE,
                remove_punct = TRUE,
                remove_symbols = TRUE) %>%
  dfm_trim(min_count = 10, min_docfreq = 5)
# min_count = remove words used fewer than 10 times overall
# min_docfreq = remove words used in fewer than 5 docs
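# Note: the dfm() arguments above follow the quanteda v1 API this gist was
# written against; in quanteda >= 3.0 the same pipeline is written with
# tokens(), roughly:
# tweetDfm <- tokens(tweetCorpus, remove_numbers = TRUE, remove_punct = TRUE,
#                    remove_symbols = TRUE) %>%
#   tokens_remove(c(stopwords("english"), stopWords)) %>%
#   dfm() %>%
#   dfm_trim(min_termfreq = 10, min_docfreq = 5)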
# showing the 50 highest-frequency words
topfeatures(tweetDfm, n = 50)
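# Document frequency is often more revealing than raw counts when hunting for
# corpus-specific stop words; docfreq() is part of quanteda:
head(sort(docfreq(tweetDfm), decreasing = TRUE), 50)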
library(stm)
# converting dfm to stm format
stmtweetDfm <- convert(tweetDfm, to = "stm")
# preparing the documents, vocab, and metadata for the ctm/stm model
out <- prepDocuments(stmtweetDfm$documents,
                     stmtweetDfm$vocab,
                     stmtweetDfm$meta,
                     lower.thresh = 10) # drop words appearing in 10 or fewer documents
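# Quick sanity check on what prepDocuments() dropped (docs.removed and
# words.removed are part of its return value):
length(out$docs.removed)  # documents removed entirely
length(out$words.removed) # vocabulary terms removed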
# fetching tweet ids of all documents that survived prepDocuments()
# (guarded, because x[-integer(0)] returns an empty vector when nothing was removed)
tweetsKept <- if (length(out$docs.removed) > 0)
  stmtweetDfm$meta$id[-out$docs.removed] else stmtweetDfm$meta$id
tweetsCleaned <- subset(tweets, tweets$tweetId %in% tweetsKept) # keeping only those tweets
save(tweetsCleaned, file = "FILEPATH/tweetsCleaned.RData")
k <- 40 # number of topics
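# K = 40 is a judgment call; searchK() (part of stm) can compare candidate
# values on held-out likelihood, residuals, and semantic coherence. A sketch,
# potentially slow on a large corpus, so left commented out:
# kResult <- searchK(out$documents, out$vocab, K = c(20, 30, 40, 50),
#                    prevalence = ~cov1 + cov2 + s(cov3Time),
#                    data = out$meta, init.type = "Spectral")
# plot(kResult)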
stmResult <- stm(out$documents, out$vocab, K = k, prevalence = ~cov1 + cov2 + s(cov3Time),
                 max.em.its = 180, data = out$meta, init.type = "Spectral", seed = 300)
save(stmResult, file = "FILEPATH/stmResult.RData")
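# Typical next steps once the model has run: top words per topic, and how topic
# prevalence varies with the covariates declared above. labelTopics() and
# estimateEffect() are both part of stm.
labelTopics(stmResult, n = 10)
effects <- estimateEffect(1:k ~ cov1 + cov2 + s(cov3Time), stmResult,
                          metadata = out$meta)
summary(effects, topics = 1) # covariate effects on topic 1's prevalence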