Last active
December 2, 2022 16:02
-
-
Save VIEWVIEWVIEW/f75bd8499cd04faeabdd84bcf75a7349 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import queue | |
| import duden | |
| from pymongo import MongoClient | |
| from crayons import yellow, red, green, white, blue | |
| import os | |
# MongoDB connection string. It is passed to MongoClient below so that
# editing it actually changes which server the crawler talks to.
conn_str = "mongodb://localhost:27017/NLP"
client = MongoClient(conn_str)
# Collection "duden" inside database "NLP"; one document per crawled word.
db = client.NLP.duden
# FIFO of words that still have to be fetched.
q = queue.Queue()
# Every word ever queued. Seeded with "" so that add_to_queue's membership
# check rejects empty strings.
current_word_set = {""}
# Maps German umlauts and eszett to their ASCII digraphs in a single pass.
_UMLAUT_TABLE = str.maketrans({
    'ä': 'ae', 'ö': 'oe', 'ü': 'ue',
    'Ä': 'Ae', 'Ö': 'Oe', 'Ü': 'Ue',
    'ß': 'ss',
})


def add_to_queue(word):
    """Queue ``word`` and its ASCII spelling variants, skipping seen ones.

    The word is cleaned (soft hyphens removed, surrounding whitespace
    stripped) *before* the duplicate check, so the check is made against the
    same spelling that is stored in ``current_word_set``. Up to three
    spellings are then queued, each only if it has not been queued before:

    1. the cleaned word (announced with a "Queued: " message),
    2. the cleaned word with umlauts and eszett transliterated,
    3. the transliterated word with ``-`` replaced by ``_``.

    Args:
        word: Spelling to enqueue; may contain soft hyphens (``\\xad``).
    """
    word = word.replace('\xad', '').strip()
    if word not in current_word_set:
        q.put(word)
        current_word_set.add(word)
        print(blue("Queued: "), word)

    ascii_word = word.translate(_UMLAUT_TABLE)
    # Order matters: the underscore variant is derived from the
    # transliterated spelling, exactly as the chained replaces did.
    for variant in (ascii_word, ascii_word.replace('-', '_')):
        if variant not in current_word_set:
            q.put(variant)
            current_word_set.add(variant)
def _dump_queue(path):
    """Drain ``q`` into ``path`` (one word per line), replacing any old file."""
    # Mode "w" truncates or creates the file, so no separate os.remove is
    # needed and a missing file can no longer raise FileNotFoundError.
    with open(path, "w", encoding="utf-8") as f:
        while not q.empty():
            f.write(q.get() + '\n')


def add_word(w):
    """Fetch ``w`` via ``duden.get``, queue its neighbours and store it.

    Steps:
      * Skip ``w`` when it is already in the collection, unless the queue is
        empty. In that case it is fetched anyway so its alphabetical
        neighbours can refill the queue.
      * If the fetch raises, report "RATELIMITED" and dump the remaining
        queue to ``queue.txt``.
      * If no word object was obtained, append ``w`` to ``null.txt``.
      * Queue the entries listed under "Im Alphabet davor" / "Im Alphabet
        danach", then insert the word unless a document with the same
        ``name`` already exists.
      * On a successful insert, record the name in ``last.txt``; on failure,
        append ``w`` to ``failed.txt``.

    Args:
        w: Lookup key handed to ``duden.get``.
    """
    if db.find_one({"name": w}) and not q.empty():
        print(yellow("Exists: ", bold=True), w)
        return

    word = None
    try:
        print("Getting: " + w)
        word = duden.get(w)
    except Exception:
        # Any fetch failure is reported as a rate limit, as before, but
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        print(red("RATELIMITED", bold=True), w)
        _dump_queue("queue.txt")

    if word is None:
        print(red("Word is null: ") + w)
        with open("null.txt", "a", encoding="utf-8") as f:
            f.write(w + '\n')
        return

    neighbours = word.before_after_structure
    for heading in ('Im Alphabet davor', 'Im Alphabet danach'):
        try:
            entries = neighbours[heading]
        except KeyError:
            # No neighbours listed on this side; keep crawling instead of
            # aborting the whole run.
            continue
        for entry in entries:
            # The second element of each entry is what gets queued --
            # presumably the lookup name; confirm against the duden package.
            add_to_queue(entry[1])

    # Re-check with the canonical name, which may differ from the key ``w``.
    if db.find_one({"name": word.name}):
        print(yellow("Exists: ", bold=True), word.name)
        return

    try:
        db.insert_one(word.export())
        print(green("Added: ", bold=True), word.name)
        with open("last.txt", "w", encoding="utf-8") as f:
            f.write(word.name)
    except Exception:
        print(red("Failed: ", bold=True) + w)
        with open("failed.txt", "a", encoding="utf-8") as f:
            f.write(w + '\n')
# Resume from the word recorded by the last successful insert. last.txt is
# written as UTF-8, so it has to be read as UTF-8 as well.
last_line = ""
with open('last.txt', encoding="utf-8") as f:
    for line in f:
        # Keep the last non-blank line, without its trailing newline, so the
        # seed matches the stored name exactly.
        if line.strip():
            last_line = line.strip()

if not last_line:
    raise SystemExit("last.txt contains no word to resume from")

q.put(last_line)
# add_word refills the queue with neighbouring words as it goes, so this
# loop runs until no new words are discovered or the queue is dumped.
while not q.empty():
    add_word(q.get())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment