Gist by @paperscissors, created November 22, 2024 21:58.
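# Word-frequency and language-analysis demo: counts words across CPU cores,
# detects the language of each paragraph, and renders a word cloud.
# Third-party dependencies (assumed installed, e.g. via
# `pip install tabulate wordcloud matplotlib langdetect`):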
import multiprocessing
import os
import random
import re
from collections import Counter

import matplotlib.pyplot as plt
from langdetect import detect
from tabulate import tabulate
from wordcloud import WordCloud
# Common stop words
STOP_WORDS = {
    'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', 'has', 'he',
    'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the', 'to', 'was', 'were',
    'will', 'with'
}
def process_chunk(chunk):
    # Lowercase the chunk and split it on whitespace
    words = chunk.lower().split()
    # Strip punctuation and other non-word characters
    words = [re.sub(r'[^\w\s]', '', word) for word in words]
    # Drop empty strings and stop words
    words = [word for word in words if word and word not in STOP_WORDS]
    # Count occurrences of each word
    return Counter(words)
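# For example, with the stop words "the" and "and" removed:
#   process_chunk("The cat and the hat")  ->  Counter({'cat': 1, 'hat': 1})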
def process_text(text):
    # Determine the number of CPU cores
    num_cores = multiprocessing.cpu_count()
    # Split the text into roughly core-sized chunks; max(1, ...) guards
    # against a zero step for very short inputs. A chunk boundary can split
    # a word in two, so counts are approximate near boundaries.
    chunk_size = max(1, len(text) // num_cores)
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
    # Process the chunks in parallel with a multiprocessing pool
    with multiprocessing.Pool(processes=num_cores) as pool:
        results = pool.map(process_chunk, chunks)
    # Merge the per-chunk counters into one
    combined_counts = Counter()
    for result in results:
        combined_counts.update(result)
    return combined_counts
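# Counter.update adds counts rather than replacing them, e.g.:
#   c = Counter({'cat': 1}); c.update(Counter({'cat': 2}))  # c == Counter({'cat': 3})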
def color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    # WordCloud calls this once per word; pick a random color from a fixed palette
    colors = ['#348888', '#22BABB', '#9EF8EE', '#FA7F08', '#F24405']
    return random.choice(colors)
def generate_word_cloud(text, font_path=None):
    # Shared WordCloud settings; the custom font is added only if it exists
    wc_kwargs = dict(width=800, height=400,
                     background_color='white',
                     color_func=color_func,
                     max_words=200)
    if font_path and os.path.exists(font_path):
        wc_kwargs['font_path'] = font_path
    wordcloud = WordCloud(**wc_kwargs).generate(text)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.tight_layout(pad=0)
    plt.show()
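# The cloud could also be written to disk with WordCloud's to_file method,
# e.g. wordcloud.to_file('cloud.png') before plt.show(); the filename here
# is illustrative.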
def detect_languages(text):
    # Treat blank-line-separated blocks as paragraphs
    paragraphs = text.strip().split('\n\n')
    language_data = []
    for i, paragraph in enumerate(paragraphs, 1):
        try:
            lang = detect(paragraph)
        except Exception:
            # langdetect raises LangDetectException on text it cannot classify
            lang = 'Unknown'
        language_data.append((i, lang, paragraph[:50] + '...'))
    return language_data
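# langdetect's output is non-deterministic by default; for reproducible
# results, set a seed before calling detect (per langdetect's documented API):
#   from langdetect import DetectorFactory
#   DetectorFactory.seed = 0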
def main():
    # Three sample paragraphs (English, Romanian, Spanish), separated by
    # blank lines so detect_languages can split them apart
    text = """
Today I decided to visit the old town where I grew up. The narrow streets and old buildings evoked precious memories. I felt as if I had traveled back in time.

Astăzi am decis să vizitez vechiul oraș în care am copilărit. Străzile înguste și clădirile vechi mi-au trezit amintiri prețioase. M-am simțit ca și cum aș fi călătorit în timp.

Hoy decidí visitar la antigua ciudad donde crecí. Las calles estrechas y los edificios viejos despertaron recuerdos preciados. Me sentí como si hubiera viajado en el tiempo.
"""
    language_data = detect_languages(text)
    print("Language Detection Results:")
    print(tabulate(language_data, headers=["Paragraph", "Detected Language", "Preview"], tablefmt="fancy_grid"))
    print("\n")
    result = process_text(text)
    table_data = [(word, count) for word, count in result.items()]
    print("Unique words and their counts (excluding stop words):")
    print(tabulate(table_data, headers=["Word", "Count"], tablefmt="fancy_grid"))
    print(f"\nTotal unique words (excluding stop words): {len(result)}")
    font_path = "/font.ttf"  # Replace with the actual path to your font file
    generate_word_cloud(' '.join(result.elements()), font_path)


if __name__ == "__main__":
    main()