Created
November 22, 2024 21:58
-
-
Save paperscissors/3575abf56ea12b0dd5c08c2125d4f95b to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import re | |
| from collections import Counter | |
| from tabulate import tabulate | |
| import multiprocessing | |
| import os | |
| from wordcloud import WordCloud | |
| import matplotlib.pyplot as plt | |
| from langdetect import detect | |
| import random | |
# Common English stop words excluded from word counts.
# frozenset: the set is a module-level constant and is never mutated,
# so an immutable set documents intent (membership tests are unchanged).
STOP_WORDS = frozenset({
    'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', 'has', 'he',
    'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the', 'to', 'was', 'were',
    'will', 'with',
})
def process_chunk(chunk):
    """Count word frequencies in one chunk of text.

    Lowercases the chunk, splits it on whitespace, strips punctuation
    (any character that is neither a word character nor whitespace),
    and skips empty strings and stop words before counting.
    """
    counts = Counter()
    for token in chunk.lower().split():
        # Strip punctuation/non-word characters from the token.
        cleaned = re.sub(r'[^\w\s]', '', token)
        # Ignore tokens that became empty or are common stop words.
        if cleaned and cleaned not in STOP_WORDS:
            counts[cleaned] += 1
    return counts
def process_text(text):
    """Count word frequencies in *text* using all CPU cores.

    Splits the text into roughly equal chunks (aligned to whitespace so
    no word is cut in half), counts words in each chunk in parallel via
    process_chunk, and merges the per-chunk Counters.

    Returns a collections.Counter mapping word -> occurrence count.
    """
    if not text:
        # Empty input: nothing to chunk or count.
        return Counter()

    num_cores = multiprocessing.cpu_count()
    # Floor at 1: len(text) // num_cores is 0 for short texts, and a
    # zero step would make the chunking loop invalid.
    chunk_size = max(1, len(text) // num_cores)

    # Build chunks, extending each boundary to the next whitespace so a
    # word is never split across two chunks (which would corrupt counts).
    chunks = []
    n = len(text)
    start = 0
    while start < n:
        end = min(start + chunk_size, n)
        while end < n and not text[end].isspace():
            end += 1
        chunks.append(text[start:end])
        start = end

    # Count each chunk in a separate worker process.
    with multiprocessing.Pool(processes=num_cores) as pool:
        results = pool.map(process_chunk, chunks)

    # Merge per-chunk counts into one Counter.
    combined_counts = Counter()
    for result in results:
        combined_counts.update(result)
    return combined_counts
def color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    """Return a random hex color from a fixed palette for a word-cloud word.

    WordCloud passes a random.Random instance as *random_state* when the
    caller wants reproducible clouds; honor it instead of ignoring it and
    fall back to the module-level `random` generator otherwise.
    """
    colors = ['#348888', '#22BABB', '#9EF8EE', '#FA7F08', '#F24405']
    rng = random_state if random_state is not None else random
    return rng.choice(colors)
def generate_word_cloud(text, font_path=None):
    """Render and display a word cloud for *text* with matplotlib.

    If *font_path* points to an existing file it is used as the cloud's
    font; otherwise WordCloud's default font is used.
    """
    # Build the WordCloud arguments once instead of duplicating the whole
    # constructor call in each branch; only font_path is conditional.
    wc_kwargs = {
        'width': 800,
        'height': 400,
        'background_color': 'white',
        'color_func': color_func,
        'max_words': 200,
    }
    if font_path and os.path.exists(font_path):
        wc_kwargs['font_path'] = font_path
    wordcloud = WordCloud(**wc_kwargs).generate(text)

    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.tight_layout(pad=0)
    plt.show()
def detect_languages(text):
    """Detect the language of each blank-line-separated paragraph.

    Returns a list of (paragraph_number, language_code, preview) tuples,
    1-indexed, where language_code is langdetect's code (e.g. 'en') or
    'Unknown' when detection fails, and preview is the first 50 chars
    of the paragraph followed by '...'.
    """
    paragraphs = text.strip().split('\n\n')
    language_data = []
    for i, paragraph in enumerate(paragraphs, 1):
        preview = paragraph[:50] + '...'
        try:
            lang = detect(paragraph)
        # Narrowed from a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit; langdetect raises
        # LangDetectException on undetectable text.
        except Exception:
            lang = 'Unknown'
        language_data.append((i, lang, preview))
    return language_data
def main():
    """Demo driver: detect paragraph languages, tabulate word counts,
    then draw a word cloud of the remaining words."""
    text = """
Today I decided to visit the old town where I grew up. The narrow streets and old buildings evoked precious memories. I felt as if I had traveled back in time.
Astăzi am decis să vizitez vechiul oraș în care am copilărit. Străzile înguste și clădirile vechi mi-au trezit amintiri prețioase. M-am simțit ca și cum aș fi călătorit în timp.
Hoy decidí visitar la antigua ciudad donde crecí. Las calles estrechas y los edificios viejos despertaron recuerdos preciados. Me sentí como si hubiera viajado en el tiempo.
"""
    # Per-paragraph language detection, rendered as a table.
    detections = detect_languages(text)
    print("Language Detection Results:")
    print(tabulate(detections,
                   headers=["Paragraph", "Detected Language", "Preview"],
                   tablefmt="fancy_grid"))
    print("\n")

    # Parallel word-frequency count (stop words excluded).
    counts = process_text(text)
    rows = list(counts.items())
    print("Unique words and their # count (excluding stop words):")
    print(tabulate(rows, headers=["Word", "Count"], tablefmt="fancy_grid"))
    print(f"\nTotal unique words (excluding stop words): {len(counts)}")

    font_path = "/font.tff"  # Replace with the actual path to your font file
    generate_word_cloud(' '.join(counts.elements()), font_path)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment