sagarnanduunc · March 28, 2018 04:54
diff --git a/function to cleaning body of tweet b/function to cleaning body of tweet
 import nltk
 import string
 from nltk.tokenize import TweetTokenizer
 tknz = TweetTokenizer()
 from nltk.corpus import stopwords
 stop = stopwords.words('english') + list(string.punctuation)

 # Deletion table for str.translate: strips every ASCII punctuation mark
 # except '#', '@' and the apostrophe, which carry meaning in tweets.
 translator = str.maketrans(
     '',
     '',
     ''.join(ch for ch in string.punctuation if ch not in ("#", "@", "'")),
 )
 def cleanTweet(text):
     """Return a cleaned, lowercased, space-joined version of a tweet body.

     Steps, in order:
       1. remove URLs, then lowercase the text;
       2. replace each run of non-ASCII characters (e.g. emoji) with a space;
       3. replace line breaks with a space;
       4. delete punctuation via the module-level ``translator`` table
          (which keeps '#', '@' and the apostrophe);
       5. tokenize with the module-level ``tknz`` tokenizer;
       6. drop every token that appears in the module-level ``stop`` list.

     Args:
         text: The raw tweet body as a ``str``.

     Returns:
         The surviving tokens joined by single spaces.
     """
     # Imported locally because the file's import preamble never imports
     # `re`; without this every call fails with NameError.
     import re

     # URLs must go before punctuation stripping, while "://" is intact.
     text = re.sub(r"\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*", "", text).lower()
     text = re.sub(r'[^\x00-\x7F]+', ' ', text)
     # Line breaks become a space (not an empty string) so the last word of
     # one line is not fused with the first word of the next.
     text = re.sub(r'[\r\n]+', ' ', text)
     text = text.translate(translator)

     return ' '.join(token for token in tknz.tokenize(text) if token not in stop)
    
 # Example: clean the "body" column of a pandas DataFrame in one pass by
 # handing cleanTweet to apply, which calls it once per cell.
 df["body"] = df["body"].apply(cleanTweet)
	import nltk
	import string
	from nltk.tokenize import TweetTokenizer
	tknz = TweetTokenizer()
	from nltk.corpus import stopwords
	stop = stopwords.words('english') + list(string.punctuation)

	# Deletion table for str.translate: strips every ASCII punctuation mark
	# except '#', '@' and the apostrophe, which carry meaning in tweets.
	translator = str.maketrans(
	    '',
	    '',
	    ''.join(ch for ch in string.punctuation if ch not in ("#", "@", "'")),
	)
	def cleanTweet(text):
	    """Return a cleaned, lowercased, space-joined version of a tweet body.

	    Steps, in order:
	      1. remove URLs, then lowercase the text;
	      2. replace each run of non-ASCII characters (e.g. emoji) with a space;
	      3. replace line breaks with a space;
	      4. delete punctuation via the module-level ``translator`` table
	         (which keeps '#', '@' and the apostrophe);
	      5. tokenize with the module-level ``tknz`` tokenizer;
	      6. drop every token that appears in the module-level ``stop`` list.

	    Args:
	        text: The raw tweet body as a ``str``.

	    Returns:
	        The surviving tokens joined by single spaces.
	    """
	    # Imported locally because the file's import preamble never imports
	    # `re`; without this every call fails with NameError.
	    import re

	    # URLs must go before punctuation stripping, while "://" is intact.
	    # The `*` quantifiers let the pattern cover multi-part hosts and
	    # full path segments rather than a single dot segment.
	    text = re.sub(r"\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*", "", text).lower()
	    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
	    # Match actual CR/LF characters (an escaped pipe would only match the
	    # literal text "\r|\n") and turn them into a space so words on
	    # adjacent lines are not fused together.
	    text = re.sub(r'[\r\n]+', ' ', text)
	    text = text.translate(translator)

	    return ' '.join(token for token in tknz.tokenize(text) if token not in stop)

	# Example: clean the "body" column of a pandas DataFrame in one pass by
	# handing cleanTweet to apply, which calls it once per cell.
	df["body"] = df["body"].apply(cleanTweet)
No results found