# See http://psc.edu/images/xsedetraining/BigData/Intro_To_Spark.pdf
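# Note (added): everything below assumes `sc` is an existing SparkContext, as
# provided automatically by the pyspark shell. A minimal sketch for running this
# as a standalone script instead would be:
#   from pyspark import SparkContext
#   sc = SparkContext.getOrCreate()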
rdd = sc.textFile("Complete_Shakespeare.txt")
print('Count number of lines:')
print(rdd.count())
print('Count number of words:')
words_rdd = rdd.flatMap(lambda x: x.split())
print(words_rdd.count())
print('Number of unique words:')
unique_rdd = words_rdd.distinct()
print(unique_rdd.count())
print('Count occurrences of each word:')
kv_rdd = words_rdd.map(lambda x: (x, 1))
occurrence_rdd = kv_rdd.reduceByKey(lambda x, y: x + y)
print(occurrence_rdd.top(5))
print("Top 5 most frequent words:")
vk_rdd = occurrence_rdd.map(lambda x: (x[1], x[0]))
toplist_rdd = vk_rdd.sortByKey(False)
print(toplist_rdd.map(lambda x: (x[1], x[0])).take(5))
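# An equivalent way to get the top 5 without the key/value swap is
# RDD.takeOrdered with a descending key (a sketch, not in the original):
#   print(occurrence_rdd.takeOrdered(5, key=lambda x: -x[1]))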
#
# Niall's answers to homework problems
#
# Remove punctuation
import re
print('Unique word count after removing punctuation:')
# Matches tokens made up entirely of non-alphanumeric characters (pure punctuation)
noalpha = re.compile('^[^a-zA-Z0-9]+$')
# Matches runs of non-alphanumeric characters within a token, to be stripped out
somepunct = re.compile('[^a-zA-Z0-9]+')
nopunct_rdd = words_rdd.filter(lambda x: noalpha.match(x) is None).map(lambda x: somepunct.sub('', x))
print(nopunct_rdd.distinct().count())
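# Added illustration: the first regex drops tokens that are nothing but
# punctuation, the second strips punctuation embedded in a word, e.g.
#   somepunct.sub('', "love's")  # -> 'loves'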
# Stop word list from https://gist.github.com/sebleier/554280
stopwords = set('i me my myself we our ours ourselves you your yours yourself yourselves he him his himself she her hers herself it its itself they them their theirs themselves what which who whom this that these those am is are was were be been being have has had having do does did doing a an the and but if or because as until while of at by for with about against between into through during before after above below to from up down in out on off over under again further then once here there when where why how all any both each few more most other some such no nor not only own same so than too very s t can will just don should now'.split())
print('Unique word count after further removing stop words:')
cantstop_rdd = nopunct_rdd.filter(lambda x: x.lower() not in stopwords)
print(cantstop_rdd.distinct().count())
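# NLTK also ships a stop-word list that could be used instead of the hand-rolled
# set above (a sketch; it requires running nltk.download('stopwords') first):
#   from nltk.corpus import stopwords as nltk_stopwords
#   stopwords = set(nltk_stopwords.words('english'))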
# Stemming
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
stems_rdd = cantstop_rdd.map(lambda x: stemmer.stem(x))
print('Unique stem word count:')
print(stems_rdd.distinct().count())
# And now the most frequent word stems after we've thrown out punctuation and stop words
print('The top 10 most frequent word stems after throwing out punctuation and stop words:')
stems_kv_rdd = stems_rdd.map(lambda x: (x, 1))
stems_occurrence_rdd = stems_kv_rdd.reduceByKey(lambda x, y: x + y)
stems_toplist_rdd = stems_occurrence_rdd.map(lambda x: (x[1], x[0])).sortByKey(False)
print(stems_toplist_rdd.take(10))
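# The same ranking can also be produced with RDD.sortBy, which avoids the
# key/value swap; note the tuples stay in (stem, count) order
# (a sketch, not part of the original homework answer):
#   print(stems_occurrence_rdd.sortBy(lambda x: x[1], ascending=False).take(10))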