Skip to content

Instantly share code, notes, and snippets.

@VIEWVIEWVIEW
Last active December 2, 2022 16:02
Show Gist options
  • Select an option

  • Save VIEWVIEWVIEW/f75bd8499cd04faeabdd84bcf75a7349 to your computer and use it in GitHub Desktop.

Select an option

Save VIEWVIEWVIEW/f75bd8499cd04faeabdd84bcf75a7349 to your computer and use it in GitHub Desktop.
import queue
import duden
from pymongo import MongoClient
from crayons import yellow, red, green, white, blue
import os
# MongoDB connection string; it is passed to MongoClient below so that
# editing it actually changes which server the crawler talks to.
conn_str = "mongodb://localhost:27017/NLP"
client = MongoClient(conn_str)
# The `duden` collection inside the `NLP` database.
db = client.NLP.duden
# FIFO queue of words that still have to be fetched.
q = queue.Queue()
# Every spelling that has ever been queued.  Seeded with the empty string so
# that an empty word never passes the "not seen yet" membership test.
current_word_set = {""}
# Maps German umlauts and eszett to their ASCII transliterations.
_UMLAUT_TABLE = str.maketrans({
    'ä': 'ae', 'ö': 'oe', 'ü': 'ue',
    'Ä': 'Ae', 'Ö': 'Oe', 'Ü': 'Ue',
    'ß': 'ss',
})


def _enqueue_if_new(candidate):
    """Queue *candidate* unless it was queued before; return True if queued."""
    if candidate in current_word_set:
        return False
    q.put(candidate)
    current_word_set.add(candidate)
    return True


def add_to_queue(word):
    """Queue *word* and its alternative spellings for fetching.

    Up to three spellings are queued, each only if it has not been seen yet:
    the word with soft hyphens and surrounding whitespace removed, the same
    word with umlauts/eszett transliterated to ASCII, and that transliteration
    with ``-`` replaced by ``_``.

    Args:
        word: The word as found in the source data.
    """
    # Normalise BEFORE the membership test: the set stores normalised words,
    # so testing the raw form would re-queue every word containing a soft
    # hyphen each time it is encountered, and would let a whitespace-only
    # word slip past the empty-string sentinel.
    word = word.replace('\xad', '').strip()
    if _enqueue_if_new(word):
        print(blue("Queued: "), word)

    # The alternative spellings are queued silently.
    transliterated = word.translate(_UMLAUT_TABLE)
    _enqueue_if_new(transliterated)
    _enqueue_if_new(transliterated.replace('-', '_'))
def _dump_queue(path):
    """Drain the pending queue into *path*, one word per line.

    The file is opened once in write mode, which both replaces any previous
    dump and works when the file does not exist yet.
    """
    with open(path, "w", encoding="utf-8") as dump:
        while not q.empty():
            dump.write(q.get() + '\n')


def _append_line(path, text):
    """Append *text* followed by a newline to the UTF-8 file at *path*."""
    with open(path, "a", encoding="utf-8") as log:
        log.write(text + '\n')


def add_word(w):
    """Fetch *w*, queue its alphabetical neighbours and store it in the DB.

    Outcomes are reported on stdout and in side files:

    * ``queue.txt`` receives the drained queue when the fetch raises.
    * ``null.txt`` collects words for which no entry was obtained.
    * ``failed.txt`` collects words whose database insert raised.
    * ``last.txt`` holds the name of the most recently stored word.

    Args:
        w: The word to look up.
    """
    # A word already in the DB is skipped, unless the queue is empty: then it
    # is fetched anyway so that its neighbours get queued.
    # NOTE(review): presumably this keeps the crawl alive when resuming from
    # an already-stored seed word; confirm.
    if db.find_one({"name": w}) and not q.empty():
        print(yellow("Exists: ", bold=True), w)
        return

    word = None
    try:
        print("Getting: " + w)
        word = duden.get(w)
    except Exception:
        # Any fetch failure is reported as a rate limit.  The pending queue is
        # written to disk; draining it also leaves the queue empty.
        print(red("RATELIMITED", bold=True), w)
        _dump_queue("queue.txt")

    if word is None:
        print(red("Word is null: ") + w)
        _append_line("null.txt", w)
        return

    # Each neighbour entry is indexed at position 1 to obtain the word to
    # queue.  Neighbours are queued before the duplicate check below, so an
    # already-stored word still contributes its neighbours.
    for section in ('Im Alphabet davor', 'Im Alphabet danach'):
        for neighbour in word.before_after_structure[section]:
            add_to_queue(neighbour[1])

    # The canonical name may differ from the spelling that was requested, so
    # the DB is checked again under that name.
    if db.find_one({"name": word.name}):
        print(yellow("Exists: ", bold=True), word.name)
        return

    try:
        db.insert_one(word.export())
    except Exception:
        print(red("Failed: ", bold=True) + w)
        _append_line("failed.txt", w)
    else:
        print(green("Added: ", bold=True), word.name)
        with open("last.txt", "w", encoding="utf-8") as marker:
            marker.write(word.name)
# Resume crawling from the last non-blank line of last.txt.  The file is
# written as UTF-8, so it is read with the same encoding.
last_line = ""
with open('last.txt', encoding="utf-8") as f:
    for line in f:
        stripped = line.strip()
        if stripped:
            last_line = stripped

# An empty file used to fail with a NameError; report the real problem instead.
if not last_line:
    raise SystemExit("last.txt contains no word to resume from")

q.put(last_line)
# add_word() may queue further words, so keep going until the queue is empty.
while not q.empty():
    add_word(q.get())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment