Learn foreign languages with Python

This is a script that I wrote to help read foreign languages. I'm no programmer, so please don't make fun of me.

It takes a regular text file as input and creates lists of unique words for memory work. In fact, it creates three separate lists, each written to its own file. The first is a list of the unique words in each chapter (a chapter being any block of text that starts with a number, although the criteria for identifying chapters can easily be changed). The second is a list of the unique words that occur at least twice. The third is a list of all vocabulary.

import re
import unicodedata
def strip_accents(s):
    """Return *s* with its combining accent marks removed.

    The string is first decomposed to Unicode normal form NFD, which splits
    an accented letter into its base letter followed by combining marks.
    Every character whose Unicode category is 'Mn' is then dropped and the
    remaining characters are joined back together.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = []
    for ch in decomposed:
        if unicodedata.category(ch) == 'Mn':
            continue
        kept.append(ch)
    return ''.join(kept)

# Characters removed from the text before it is split into words.
# A plain hyphen is handled separately: it becomes a space, so that
# hyphenated compounds are split into their parts.
punctuation = '()?!\"\',.:;—'

# A token containing a run of one to three digits is treated as a chapter
# marker. The match may occur anywhere in the token, so "12." and "2nd"
# both count as markers.
chapter_pattern = re.compile('([0-9]{1,3})')


def tokenize(text):
    """Split *text* into lower-case words with punctuation removed.

    Hyphens are replaced by spaces; every character in ``punctuation`` is
    deleted outright. The result is lower-cased and split on whitespace.
    """
    table = str.maketrans('-', ' ', punctuation)
    return text.translate(table).lower().split()


def collect_vocabulary(words):
    """Group the unique words of *words* by chapter.

    Args:
        words: iterable of tokens as produced by ``tokenize``.

    Returns:
        A tuple ``(sections, repeated, all_vocab)``:

        * ``sections`` is a list of ``(heading, new_words)`` pairs in text
          order. ``heading`` is the chapter number, or ``None`` for the
          first entry, which holds any words found before the first chapter
          marker. ``new_words`` lists, in order of appearance, the words
          first seen in that section.
        * ``repeated`` lists ``"word, chapter"`` for every word that occurs
          at least twice, in the order the second occurrence was met;
          ``chapter`` is the chapter of the first occurrence.
        * ``all_vocab`` lists ``"word, chapter"`` for every unique word in
          order of first appearance.

        Words found before the first chapter marker are recorded with
        chapter ``0``.
    """
    sections = [(None, [])]
    first_seen = {}        # word -> chapter of first occurrence
    already_repeated = set()
    repeated = []
    all_vocab = []
    chapter = 0            # words before any marker are filed under chapter 0

    for word in words:
        marker = chapter_pattern.search(word)
        if marker:
            chapter = int(marker.group(1))
            sections.append((chapter, []))
            continue

        if word not in first_seen:
            first_seen[word] = chapter
            sections[-1][1].append(word)
            all_vocab.append(word + ', ' + str(chapter))
        elif word not in already_repeated:
            already_repeated.add(word)
            repeated.append(word + ', ' + str(first_seen[word]))

    return sections, repeated, all_vocab


def main():
    """Read ``source_.txt`` and write the three vocabulary files.

    Files are read and written as UTF-8 so that accented text is handled
    the same way on every platform.
    """
    with open('source_.txt', encoding='utf-8') as source_file:
        source_string = source_file.read()

    sections, repeated, all_vocab = collect_vocabulary(tokenize(source_string))

    with open('vocab_per_chapter_sorted.txt', 'w', encoding='utf-8') as out_file:
        for heading, new_words in sections:
            if heading is not None:
                out_file.write(str(heading) + '.\n')
            # Every section is written, including the last one.
            for w in sorted(new_words, key=strip_accents):
                out_file.write(w + '\n')

    with open('all_vocab_2_or_more_occurences.txt', 'w', encoding='utf-8') as out_file2:
        for w in sorted(repeated, key=strip_accents):
            out_file2.write(w + '\n')

    with open('all_vocab.txt', 'w', encoding='utf-8') as out_file3:
        for w in sorted(all_vocab, key=strip_accents):
            out_file3.write(w + '\n')


if __name__ == '__main__':
    main()

Comments

Popular posts from this blog

Classical Latin text-to-speech (tts)

Recording from more than one microphone in Linux

High-quality audio conferencing