Created
November 22, 2024 21:58
-
-
Save paperscissors/3575abf56ea12b0dd5c08c2125d4f95b to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import re | |
| from collections import Counter | |
| from tabulate import tabulate | |
| import multiprocessing | |
| import os | |
| from wordcloud import WordCloud | |
| import matplotlib.pyplot as plt | |
| from langdetect import detect | |
| import random | |
# Common English stop words excluded from word counts.
# frozenset: the set is a module-level constant and is never mutated,
# so an immutable set documents intent (membership tests are unchanged).
STOP_WORDS = frozenset({
    'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', 'has', 'he',
    'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the', 'to', 'was', 'were',
    'will', 'with',
})
def process_chunk(chunk):
    """Count word frequencies in one chunk of text.

    Lowercases the chunk, splits it on whitespace, strips punctuation
    (any character that is neither a word character nor whitespace),
    and skips empty strings and stop words before counting.
    """
    counts = Counter()
    for token in chunk.lower().split():
        # Strip punctuation/non-word characters from the token.
        cleaned = re.sub(r'[^\w\s]', '', token)
        # Ignore tokens that became empty or are common stop words.
        if cleaned and cleaned not in STOP_WORDS:
            counts[cleaned] += 1
    return counts
def process_text(text):
    """Count word frequencies in *text* using all CPU cores.

    Splits the text into roughly equal chunks (aligned to whitespace so
    no word is cut in half), counts words in each chunk in parallel via
    process_chunk, and merges the per-chunk Counters.

    Returns a collections.Counter mapping word -> occurrence count.
    """
    if not text:
        # Empty input: nothing to chunk or count.
        return Counter()

    num_cores = multiprocessing.cpu_count()
    # Floor at 1: len(text) // num_cores is 0 for short texts, and a
    # zero step would make the chunking loop invalid.
    chunk_size = max(1, len(text) // num_cores)

    # Build chunks, extending each boundary to the next whitespace so a
    # word is never split across two chunks (which would corrupt counts).
    chunks = []
    n = len(text)
    start = 0
    while start < n:
        end = min(start + chunk_size, n)
        while end < n and not text[end].isspace():
            end += 1
        chunks.append(text[start:end])
        start = end

    # Count each chunk in a separate worker process.
    with multiprocessing.Pool(processes=num_cores) as pool:
        results = pool.map(process_chunk, chunks)

    # Merge per-chunk counts into one Counter.
    combined_counts = Counter()
    for result in results:
        combined_counts.update(result)
    return combined_counts
def color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    """Return a random hex color from a fixed palette for a word-cloud word.

    WordCloud passes a random.Random instance as *random_state* when the
    caller wants reproducible clouds; honor it instead of ignoring it and
    fall back to the module-level `random` generator otherwise.
    """
    colors = ['#348888', '#22BABB', '#9EF8EE', '#FA7F08', '#F24405']
    rng = random_state if random_state is not None else random
    return rng.choice(colors)
def generate_word_cloud(text, font_path=None):
    """Render and display a word cloud for *text* with matplotlib.

    If *font_path* points to an existing file it is used as the cloud's
    font; otherwise WordCloud's default font is used.
    """
    # Build the WordCloud arguments once instead of duplicating the whole
    # constructor call in each branch; only font_path is conditional.
    wc_kwargs = {
        'width': 800,
        'height': 400,
        'background_color': 'white',
        'color_func': color_func,
        'max_words': 200,
    }
    if font_path and os.path.exists(font_path):
        wc_kwargs['font_path'] = font_path
    wordcloud = WordCloud(**wc_kwargs).generate(text)

    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.tight_layout(pad=0)
    plt.show()
def detect_languages(text):
    """Detect the language of each blank-line-separated paragraph.

    Returns a list of (paragraph_number, language_code, preview) tuples,
    1-indexed, where language_code is langdetect's code (e.g. 'en') or
    'Unknown' when detection fails, and preview is the first 50 chars
    of the paragraph followed by '...'.
    """
    paragraphs = text.strip().split('\n\n')
    language_data = []
    for i, paragraph in enumerate(paragraphs, 1):
        preview = paragraph[:50] + '...'
        try:
            lang = detect(paragraph)
        # Narrowed from a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit; langdetect raises
        # LangDetectException on undetectable text.
        except Exception:
            lang = 'Unknown'
        language_data.append((i, lang, preview))
    return language_data
def main():
    """Demo driver: detect paragraph languages, tabulate word counts,
    then draw a word cloud of the remaining words."""
    text = """
Today I decided to visit the old town where I grew up. The narrow streets and old buildings evoked precious memories. I felt as if I had traveled back in time.
Astăzi am decis să vizitez vechiul oraș în care am copilărit. Străzile înguste și clădirile vechi mi-au trezit amintiri prețioase. M-am simțit ca și cum aș fi călătorit în timp.
Hoy decidí visitar la antigua ciudad donde crecí. Las calles estrechas y los edificios viejos despertaron recuerdos preciados. Me sentí como si hubiera viajado en el tiempo.
"""
    # Per-paragraph language detection, rendered as a table.
    detections = detect_languages(text)
    print("Language Detection Results:")
    print(tabulate(detections,
                   headers=["Paragraph", "Detected Language", "Preview"],
                   tablefmt="fancy_grid"))
    print("\n")

    # Parallel word-frequency count (stop words excluded).
    counts = process_text(text)
    rows = list(counts.items())
    print("Unique words and their # count (excluding stop words):")
    print(tabulate(rows, headers=["Word", "Count"], tablefmt="fancy_grid"))
    print(f"\nTotal unique words (excluding stop words): {len(counts)}")

    font_path = "/font.tff"  # Replace with the actual path to your font file
    generate_word_cloud(' '.join(counts.elements()), font_path)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment