Learn foreign languages with Python
This is a script that I wrote to help read foreign languages. I'm no programmer, so please don't make fun of me.
It takes a regular text file as input and creates lists of unique words for memory work. In fact, it creates three separate lists. The first is a list of unique words in each chapter (i.e. any block of text that starts with a number, but the criteria for identifying chapters can easily be changed). The second is a list of unique words that occur at least twice. The third is a list of all vocabulary.
import re
import unicodedata
def strip_accents(s):
    """Return *s* with combining accent marks removed.

    The string is decomposed to Unicode NFD form, which splits an accented
    letter into its base letter followed by combining marks, and every
    character in the "Mn" (nonspacing mark) category is then dropped.
    Used as a sort key so accented words sort next to their unaccented
    spellings.
    """
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )
# Characters that are deleted outright before the text is split into words.
PUNCTUATION = '()?!\"\',.:;—'

# One-pass cleanup table: a hyphen becomes a space (so hyphenated compounds
# split into their parts) and every PUNCTUATION character is removed.
_CLEANUP_TABLE = str.maketrans('-', ' ', PUNCTUATION)

# Any token containing a digit is treated as a chapter marker; the first run
# of one to three digits in it is the chapter number.
CHAPTER_RE = re.compile(r'([0-9]{1,3})')


def tokenize(text):
    """Return the lower-cased words of *text* with punctuation removed."""
    return text.translate(_CLEANUP_TABLE).lower().split()


def collect_vocab(words):
    """Group the vocabulary in *words* by the chapter that introduces it.

    Returns a tuple ``(sections, first_seen, repeated)``:

    * ``sections`` - list of ``(chapter_number, new_words)`` pairs in text
      order. The first pair has ``None`` as its number and holds any words
      that appear before the first chapter marker.
    * ``first_seen`` - dict mapping each distinct word to the chapter in
      which it first occurs (0 for words before the first marker).
    * ``repeated`` - words that occur at least twice, in the order their
      second occurrence is met.
    """
    sections = [(None, [])]
    first_seen = {}
    repeated = []
    already_repeated = set()
    # Words met before any chapter marker are attributed to chapter 0
    # instead of failing on an unassigned chapter number.
    chapter = 0

    for word in words:
        marker = CHAPTER_RE.search(word)
        if marker:
            chapter = int(marker.group(1))
            sections.append((chapter, []))
            continue
        if word not in first_seen:
            first_seen[word] = chapter
            sections[-1][1].append(word)
        elif word not in already_repeated:
            already_repeated.add(word)
            repeated.append(word)

    return sections, first_seen, repeated


def write_lines(path, lines):
    """Write each item of *lines* to *path* on its own line (UTF-8)."""
    with open(path, 'w', encoding='utf-8') as handle:
        handle.writelines(line + '\n' for line in lines)


def main():
    """Read ``source_.txt`` and write the three vocabulary lists.

    * ``vocab_per_chapter_sorted.txt`` - for each chapter, a ``N.`` header
      followed by the words first introduced in that chapter.
    * ``all_vocab_2_or_more_occurences.txt`` - ``word, chapter`` for every
      word occurring at least twice (chapter of first occurrence).
    * ``all_vocab.txt`` - ``word, chapter`` for every distinct word.

    All lists are sorted with accents ignored.
    """
    # The source is read in full before any output file is opened, so a
    # missing input does not leave empty output files behind.
    with open('source_.txt', encoding='utf-8') as source_file:
        source_string = source_file.read()

    sections, first_seen, repeated = collect_vocab(tokenize(source_string))

    # Every section is written, including the last one, which has no
    # following chapter marker to trigger it.
    chapter_lines = []
    for number, new_words in sections:
        if number is not None:
            chapter_lines.append(str(number) + '.')
        chapter_lines.extend(sorted(new_words, key=strip_accents))
    write_lines('vocab_per_chapter_sorted.txt', chapter_lines)

    repeated_lines = [w + ', ' + str(first_seen[w]) for w in repeated]
    write_lines('all_vocab_2_or_more_occurences.txt',
                sorted(repeated_lines, key=strip_accents))

    vocab_lines = [w + ', ' + str(c) for w, c in first_seen.items()]
    write_lines('all_vocab.txt', sorted(vocab_lines, key=strip_accents))


if __name__ == '__main__':
    main()
It takes a regular text file as input and creates lists of unique words for memory work. In fact, it creates three separate lists. The first is a list of unique words in each chapter (i.e. any block of text that starts with a number, but the criteria for identifying chapters can easily be changed). The second is a list of unique words that occur at least twice. The third is a list of all vocabulary.
import re
import unicodedata
def strip_accents(s):
    """Return *s* with combining accent marks removed.

    The string is decomposed to Unicode NFD form, which splits an accented
    letter into its base letter followed by combining marks, and every
    character in the "Mn" (nonspacing mark) category is then dropped.
    Used as a sort key so accented words sort next to their unaccented
    spellings.
    """
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )
# Characters that are deleted outright before the text is split into words.
PUNCTUATION = '()?!\"\',.:;—'

# One-pass cleanup table: a hyphen becomes a space (so hyphenated compounds
# split into their parts) and every PUNCTUATION character is removed.
_CLEANUP_TABLE = str.maketrans('-', ' ', PUNCTUATION)

# Any token containing a digit is treated as a chapter marker; the first run
# of one to three digits in it is the chapter number.
CHAPTER_RE = re.compile(r'([0-9]{1,3})')


def tokenize(text):
    """Return the lower-cased words of *text* with punctuation removed."""
    return text.translate(_CLEANUP_TABLE).lower().split()


def collect_vocab(words):
    """Group the vocabulary in *words* by the chapter that introduces it.

    Returns a tuple ``(sections, first_seen, repeated)``:

    * ``sections`` - list of ``(chapter_number, new_words)`` pairs in text
      order. The first pair has ``None`` as its number and holds any words
      that appear before the first chapter marker.
    * ``first_seen`` - dict mapping each distinct word to the chapter in
      which it first occurs (0 for words before the first marker).
    * ``repeated`` - words that occur at least twice, in the order their
      second occurrence is met.
    """
    sections = [(None, [])]
    first_seen = {}
    repeated = []
    already_repeated = set()
    # Words met before any chapter marker are attributed to chapter 0
    # instead of failing on an unassigned chapter number.
    chapter = 0

    for word in words:
        marker = CHAPTER_RE.search(word)
        if marker:
            chapter = int(marker.group(1))
            sections.append((chapter, []))
            continue
        if word not in first_seen:
            first_seen[word] = chapter
            sections[-1][1].append(word)
        elif word not in already_repeated:
            already_repeated.add(word)
            repeated.append(word)

    return sections, first_seen, repeated


def write_lines(path, lines):
    """Write each item of *lines* to *path* on its own line (UTF-8)."""
    with open(path, 'w', encoding='utf-8') as handle:
        handle.writelines(line + '\n' for line in lines)


def main():
    """Read ``source_.txt`` and write the three vocabulary lists.

    * ``vocab_per_chapter_sorted.txt`` - for each chapter, a ``N.`` header
      followed by the words first introduced in that chapter.
    * ``all_vocab_2_or_more_occurences.txt`` - ``word, chapter`` for every
      word occurring at least twice (chapter of first occurrence).
    * ``all_vocab.txt`` - ``word, chapter`` for every distinct word.

    All lists are sorted with accents ignored.
    """
    # The source is read in full before any output file is opened, so a
    # missing input does not leave empty output files behind.
    with open('source_.txt', encoding='utf-8') as source_file:
        source_string = source_file.read()

    sections, first_seen, repeated = collect_vocab(tokenize(source_string))

    # Every section is written, including the last one, which has no
    # following chapter marker to trigger it.
    chapter_lines = []
    for number, new_words in sections:
        if number is not None:
            chapter_lines.append(str(number) + '.')
        chapter_lines.extend(sorted(new_words, key=strip_accents))
    write_lines('vocab_per_chapter_sorted.txt', chapter_lines)

    repeated_lines = [w + ', ' + str(first_seen[w]) for w in repeated]
    write_lines('all_vocab_2_or_more_occurences.txt',
                sorted(repeated_lines, key=strip_accents))

    vocab_lines = [w + ', ' + str(c) for w, c in first_seen.items()]
    write_lines('all_vocab.txt', sorted(vocab_lines, key=strip_accents))


if __name__ == '__main__':
    main()
Comments
Post a Comment