Skip to content

Instantly share code, notes, and snippets.

@dbcooper
Created August 6, 2020 07:34
Show Gist options
  • Select an option

  • Save dbcooper/71b9dc529b47dedeef0157b4843a51df to your computer and use it in GitHub Desktop.

Select an option

Save dbcooper/71b9dc529b47dedeef0157b4843a51df to your computer and use it in GitHub Desktop.
# See http://psc.edu/images/xsedetraining/BigData/Intro_To_Spark.pdf
#
# Word-count walkthrough over "Complete_Shakespeare.txt".
# NOTE(review): `sc` is not defined in this script; presumably it is the
# SparkContext supplied by the pyspark shell -- confirm how this is launched.
rdd = sc.textFile("Complete_Shakespeare.txt")
print('Count number of lines:')
print(rdd.count())

print('Count number of words:')
# Split every line on whitespace into individual tokens.  This RDD is reused
# by several actions (here and further down the script); cache it so each
# action does not re-read and re-split the input file.
words_rdd = rdd.flatMap(lambda x: x.split()).cache()
print(words_rdd.count())

print('Number of unique words:')
unique_rdd = words_rdd.distinct()
print(unique_rdd.count())

print('Count occurance of each word:')
# Classic map/reduce word count: emit (word, 1) and sum the ones per word.
kv_rdd = words_rdd.map(lambda x: (x, 1))
occurance_rdd = kv_rdd.reduceByKey(lambda x, y: x + y)
# top(5) orders the (word, count) tuples by their natural ordering, i.e. by
# word first, so this is only a peek at five pairs -- not the most frequent.
print(occurance_rdd.top(5))

print("Top 5 most frequent words:")
# Swap to (count, word) so the count becomes the sort key, sort descending,
# then swap back to (word, count) for display.
vk_rdd = occurance_rdd.map(lambda x: (x[1], x[0]))
toplist_rdd = vk_rdd.sortByKey(False)
print(toplist_rdd.map(lambda x: (x[1], x[0])).take(5))
#
# Niall's answers to homework problems
#
# Strip punctuation from the tokens, then count the distinct words that remain
import re
print('Unique word count after removing punctuation:')
# Matches a token consisting solely of non-alphanumeric characters
noalpha = re.compile('^[^a-zA-Z0-9]+$')
# Matches every run of non-alphanumeric characters inside a token
somepunct = re.compile('[^a-zA-Z0-9]+')
# Step 1: discard tokens that contain no letter or digit at all
alnum_tokens_rdd = words_rdd.filter(lambda token: not noalpha.match(token))
# Step 2: delete the remaining punctuation from the surviving tokens
nopunct_rdd = alnum_tokens_rdd.map(lambda token: somepunct.sub('', token))
nopunct_unique_rdd = nopunct_rdd.distinct()
print(nopunct_unique_rdd.count())
# Stop word list from https://gist.github.com/sebleier/554280
stopwords = set('i me my myself we our ours ourselves you your yours yourself yourselves he him his himself she her hers herself it its itself they them their theirs themselves what which who whom this that these those am is are was were be been being have has had having do does did doing a an the and but if or because as until while of at by for with about against between into through during before after above below to from up down in out on off over under again further then once here there when where why how all any both each few more most other some such no nor not only own same so than too very s t can will just don should now'.split())
print('Unique word count after further removing stop words:')
# The stop-word test is case-insensitive; the kept words retain their case
cantstop_rdd = nopunct_rdd.filter(lambda token: token.lower() not in stopwords)
cantstop_unique_rdd = cantstop_rdd.distinct()
print(cantstop_unique_rdd.count())
# Stemming: reduce each remaining word to its Porter stem
import nltk
# Import the one name that is used instead of a wildcard, so nothing else from
# nltk.stem.porter is silently bound (or rebound) in this script's namespace.
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
# Two actions below consume the stems; cache them so the per-word stemming is
# not recomputed for the second one.
stems_rdd = cantstop_rdd.map(lambda x: stemmer.stem(x)).cache()
print('Unique stem word count :')
print(stems_rdd.distinct().count())
# And now the most frequent word stems after we've thrown out punctuation and stop words
print('The top 10 most frequent word stems after throwing out punctuation and stop words:')
# Same map/reduce count as for plain words: (stem, 1) summed per stem.
stems_kv_rdd = stems_rdd.map(lambda x: (x, 1))
stems_occurance_rdd = stems_kv_rdd.reduceByKey(lambda x, y: x + y)
# Swap to (count, stem) so the count is the key, then sort descending.
stems_toplist_rdd = stems_occurance_rdd.map(lambda x: (x[1], x[0])).sortByKey(False)
# Note: the pairs are printed as (count, stem), not swapped back.
print(stems_toplist_rdd.take(10))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment