抽取所有英文单词的基本形式,小写,计数。
python to_word_list.py path_to_text.txt min_word_count
e.g.
python to_word_list.py path_to_text.txt 2 will show words that appear at least twice (min_word_count = 2).
| import os | |
| from collections import Counter | |
| import re | |
| import sys | |
| from nltk.stem import WordNetLemmatizer | |
def clean1(txt):
    """Collapse every run of characters outside [0-9a-zA-Z] into one space."""
    non_alnum = re.compile(r'[^0-9a-zA-Z]+')
    return non_alnum.sub(' ', txt)
def clean(txt):
    """Collapse every run of non-letter characters (digits included) into one space."""
    non_alpha = re.compile(r'[^a-zA-Z]+')
    return non_alpha.sub(' ', txt)
def load_data(path):
    """Read the whole text file at *path* and return its contents as a string.

    The file is decoded as UTF-8.

    :param path: filesystem path of the text file to read.
    :returns: full file contents.
    :raises OSError: if the file cannot be opened.
    """
    # The original called os.path.join(path) — with a single argument that is
    # a no-op, so the path is opened directly.
    with open(path, 'r', encoding='utf-8') as f:
        return f.read()
def stem_counter(wc):
    """Merge a word->count mapping onto lemmatized, lowercased base forms.

    Lemmatization is attempted as a noun first, then as a verb, then as an
    adverb; each fallback is tried only when the previous attempt left the
    word unchanged.  Counts of words sharing a base form are summed.

    :param wc: mapping of word -> occurrence count (e.g. a Counter).
    :returns: dict mapping lowercased lemma -> summed count.
    """
    lemmatizer = WordNetLemmatizer()
    counter = {}
    for w, c in wc.items():
        w2 = lemmatizer.lemmatize(w, 'n')
        # Compare the strings themselves, not their lengths: a lemma can
        # differ from the word while having the same length (e.g. "geese"
        # -> "goose"), and the original length test then wrongly fell
        # through to the next part of speech.
        if w2 == w:
            w2 = lemmatizer.lemmatize(w, 'v')
        if w2 == w:
            w2 = lemmatizer.lemmatize(w, 'r')
        w2 = w2.lower()
        counter[w2] = counter.get(w2, 0) + c
    return counter
def sort_counter(wc, min_c=3, max_c=500, min_w_len=2):
    """Return (word, count) pairs sorted by count descending, word ascending.

    Words whose length is <= min_w_len are dropped (note the cutoff is
    exclusive: with the default of 2, two-letter words are excluded), as
    are words whose count falls outside [min_c, max_c].

    :param wc: mapping of word -> count.
    :param min_c: smallest count to keep (inclusive).
    :param max_c: largest count to keep (inclusive).
    :param min_w_len: words of this length or shorter are dropped.
    :returns: list of (word, count) tuples.
    """
    kept = [(w, c) for w, c in wc.items()
            if len(w) > min_w_len and min_c <= c <= max_c]
    # One sort by (descending count, ascending word) replaces the original
    # hand-rolled count-bucket machinery — identical output, less code.
    kept.sort(key=lambda pair: (-pair[1], pair[0]))
    return kept
def main(fname, min_c, max_c):
    """Load *fname*, count lemmatized lowercase words, and print them
    grouped by frequency (highest first), with a header line per count."""
    text = load_data(fname)
    tokens = [token.lower() for token in clean(text).split()]
    counts = stem_counter(Counter(tokens))
    prev = 0
    for word, count in sort_counter(counts, min_c, max_c):
        if count != prev:
            prev = count
            print('#', count, '次')
        print(word)
if __name__ == '__main__':
    # Usage: python to_word_list.py path_to_text.txt [min_count] [max_count]
    fname = sys.argv[1]
    # Plain conditional expressions instead of the `cond and x or default`
    # trick: with `and/or`, an explicit "0" for max_c silently became 1000.
    min_c = int(sys.argv[2]) if len(sys.argv) > 2 else 0
    max_c = int(sys.argv[3]) if len(sys.argv) > 3 else 1000
    main(fname, min_c, max_c)