dbcooper · August 6, 2020 07:34
diff --git a/spark_shakespeare_exercises.py b/spark_shakespeare_exercises.py
 # See http://psc.edu/images/xsedetraining/BigData/Intro_To_Spark.pdf
 #
 # Word-count exercises over Complete_Shakespeare.txt.
 # NOTE: `sc` is not defined in this script; it is presumably the SparkContext
 # supplied by the pyspark shell -- confirm before running standalone.

 rdd = sc.textFile("Complete_Shakespeare.txt")

 print('Count number of lines:')
 print(rdd.count())

 print('Count number of words:')
 # Whitespace-delimited tokens. Cached because every action below that is
 # built on words_rdd would otherwise re-read and re-split the input file.
 words_rdd = rdd.flatMap(lambda line: line.split()).cache()
 print(words_rdd.count())

 print('Number of unique words:')
 unique_rdd = words_rdd.distinct()
 print(unique_rdd.count())

 print('Count occurance of each word:')
 kv_rdd = words_rdd.map(lambda word: (word, 1))
 occurance_rdd = kv_rdd.reduceByKey(lambda x, y: x + y)
 # top(5) without a key compares the (word, count) tuples themselves, so this
 # prints the five pairs whose words sort last -- a sample of the counts, not
 # the most frequent words.
 print(occurance_rdd.top(5))

 print("Top 5 most frequent words:")
 # Rank by the count field directly. This yields the same (word, count) pairs
 # as swapping to (count, word), sorting the whole RDD descending and taking
 # five, but without a global sort of every distinct word.
 print(occurance_rdd.top(5, key=lambda pair: pair[1]))

 #
 # Niall's answers to homework problems
 #
 # Builds on words_rdd (the whitespace-split tokens) defined earlier in the
 # script.

 # Remove punctuation
 import re
 print('Unique word count after removing punctuation:')
 # noalpha matches tokens made up entirely of non-alphanumeric characters;
 # those are dropped outright. somepunct then deletes every run of
 # non-alphanumerics from the survivors (e.g. "don't" -> "dont"). Because each
 # survivor contains at least one alphanumeric, no token becomes empty.
 noalpha = re.compile('^[^a-zA-Z0-9]+$')
 somepunct = re.compile('[^a-zA-Z0-9]+')
 nopunct_rdd = (
     words_rdd
     .filter(lambda token: noalpha.match(token) is None)
     .map(lambda token: somepunct.sub('', token))
 )
 print(nopunct_rdd.distinct().count())

 # Stop word list from https://gist.github.com/sebleier/554280
 stopwords = set( 'i me my myself we our ours ourselves you your yours yourself yourselves he him his himself she her hers herself it its itself they them their theirs themselves what which who whom this that these those am is are was were be been being have has had having do does did doing a an the and but if or because as until while of at by for with about against between into through during before after above below to from up down in out on off over under again further then once here there when where why how all any both each few more most other some such no nor not only own same so than too very s t can will just don should now'.split() )

 print('Unique word count after further removing stop words:')
 # The lookup is case-insensitive (tokens are lowercased for the membership
 # test only); tokens that survive keep their original case.
 cantstop_rdd = nopunct_rdd.filter(lambda token: token.lower() not in stopwords)
 print(cantstop_rdd.distinct().count())

 # Stemming
 import nltk
 # Import the one name that is used instead of `*`, so nothing else from the
 # module leaks into this script's namespace.
 from nltk.stem.porter import PorterStemmer
 stemmer = PorterStemmer()
 # stems_rdd feeds two separate actions below (the distinct count and the
 # frequency ranking); cache it so each token is stemmed only once.
 stems_rdd = cantstop_rdd.map(stemmer.stem).cache()
 print('Unique stem word count :')
 print(stems_rdd.distinct().count())

 # And now the most frequent word stems after we've thrown out punctuation and stop words
 print('The top 10 most frequent word stems after throwing out punctuation and stop words:')
 stems_kv_rdd = stems_rdd.map(lambda stem: (stem, 1))
 stems_occurance_rdd = stems_kv_rdd.reduceByKey(lambda x, y: x + y)
 # Flip to (count, stem) so the tuples order by count first, then let top()
 # pick the ten largest. The printed pairs keep the (count, stem) shape, and
 # no full sort of every distinct stem is needed.
 stems_by_count_rdd = stems_occurance_rdd.map(lambda pair: (pair[1], pair[0]))
 print(stems_by_count_rdd.top(10))
	# See http://psc.edu/images/xsedetraining/BigData/Intro_To_Spark.pdf
	#
	# Word-count exercises over Complete_Shakespeare.txt.
	# NOTE: `sc` is not defined in this script; it is presumably the SparkContext
	# supplied by the pyspark shell -- confirm before running standalone.

	rdd = sc.textFile("Complete_Shakespeare.txt")

	print('Count number of lines:')
	print(rdd.count())

	print('Count number of words:')
	# Whitespace-delimited tokens. Cached because every action below that is
	# built on words_rdd would otherwise re-read and re-split the input file.
	words_rdd = rdd.flatMap(lambda line: line.split()).cache()
	print(words_rdd.count())

	print('Number of unique words:')
	unique_rdd = words_rdd.distinct()
	print(unique_rdd.count())

	print('Count occurance of each word:')
	kv_rdd = words_rdd.map(lambda word: (word, 1))
	occurance_rdd = kv_rdd.reduceByKey(lambda x, y: x + y)
	# top(5) without a key compares the (word, count) tuples themselves, so this
	# prints the five pairs whose words sort last -- a sample of the counts, not
	# the most frequent words.
	print(occurance_rdd.top(5))

	print("Top 5 most frequent words:")
	# Rank by the count field directly. This yields the same (word, count) pairs
	# as swapping to (count, word), sorting the whole RDD descending and taking
	# five, but without a global sort of every distinct word.
	print(occurance_rdd.top(5, key=lambda pair: pair[1]))

	#
	# Niall's answers to homework problems
	#
	# Builds on words_rdd (the whitespace-split tokens) defined earlier in the
	# script.

	# Remove punctuation
	import re
	print('Unique word count after removing punctuation:')
	# noalpha matches tokens made up entirely of non-alphanumeric characters;
	# those are dropped outright. somepunct then deletes every run of
	# non-alphanumerics from the survivors (e.g. "don't" -> "dont"). Because each
	# survivor contains at least one alphanumeric, no token becomes empty.
	noalpha = re.compile('^[^a-zA-Z0-9]+$')
	somepunct = re.compile('[^a-zA-Z0-9]+')
	nopunct_rdd = (
		words_rdd
		.filter(lambda token: noalpha.match(token) is None)
		.map(lambda token: somepunct.sub('', token))
	)
	print(nopunct_rdd.distinct().count())

	# Stop word list from https://gist.github.com/sebleier/554280
	stopwords = set( 'i me my myself we our ours ourselves you your yours yourself yourselves he him his himself she her hers herself it its itself they them their theirs themselves what which who whom this that these those am is are was were be been being have has had having do does did doing a an the and but if or because as until while of at by for with about against between into through during before after above below to from up down in out on off over under again further then once here there when where why how all any both each few more most other some such no nor not only own same so than too very s t can will just don should now'.split() )

	print('Unique word count after further removing stop words:')
	# The lookup is case-insensitive (tokens are lowercased for the membership
	# test only); tokens that survive keep their original case.
	cantstop_rdd = nopunct_rdd.filter(lambda token: token.lower() not in stopwords)
	print(cantstop_rdd.distinct().count())

	# Stemming
	import nltk
	# Import the one name that is used instead of `*`, so nothing else from the
	# module leaks into this script's namespace.
	from nltk.stem.porter import PorterStemmer
	stemmer = PorterStemmer()
	# stems_rdd feeds two separate actions below (the distinct count and the
	# frequency ranking); cache it so each token is stemmed only once.
	stems_rdd = cantstop_rdd.map(stemmer.stem).cache()
	print('Unique stem word count :')
	print(stems_rdd.distinct().count())

	# And now the most frequent word stems after we've thrown out punctuation and stop words
	print('The top 10 most frequent word stems after throwing out punctuation and stop words:')
	stems_kv_rdd = stems_rdd.map(lambda stem: (stem, 1))
	stems_occurance_rdd = stems_kv_rdd.reduceByKey(lambda x, y: x + y)
	# Flip to (count, stem) so the tuples order by count first, then let top()
	# pick the ten largest. The printed pairs keep the (count, stem) shape, and
	# no full sort of every distinct stem is needed.
	stems_by_count_rdd = stems_occurance_rdd.map(lambda pair: (pair[1], pair[0]))
	print(stems_by_count_rdd.top(10))
No results found