| |
|
|
| import os |
| import re |
| from glob import glob |
| from functions import * |
|
|
| |
| |
class database(object):
    """Accumulate a French/Alsatian parallel corpus from several sources.

    Every ``get_data_*`` method parses one source, appends
    ``{'fr': ..., 'als': ...}`` pairs to ``self.db`` and updates the
    per-language sentence and word frequency dictionaries.
    """

    def __init__(self):
        """Create an empty database with empty frequency tables."""
        # sentence -> number of times it was seen, per language
        self.count_sentences_fr = {}
        self.count_sentences_als = {}
        # whitespace-separated token -> number of times it was seen
        self.count_words_fr = {}
        self.count_words_als = {}

        # list of {'fr': str, 'als': str} pairs, in the order they were read
        self.db = []

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _tally(sentence_counts, word_counts, line):
        """Record one occurrence of *line* and of each word it contains."""
        sentence_counts[line] = sentence_counts.get(line, 0) + 1
        for word in line.split():
            word_counts[word] = word_counts.get(word, 0) + 1

    @staticmethod
    def _read_lines(path):
        """Return the lines of *path*, decoded as UTF-8 or else ISO-8859-1.

        The whole file is decoded inside the ``try`` so that an invalid
        byte anywhere in the file (not only in the first buffered chunk)
        triggers the ISO-8859-1 fallback, and no handle is left open.
        """
        try:
            with open(path, 'r', encoding="utf-8") as handle:
                return handle.readlines()
        except UnicodeDecodeError:
            with open(path, 'r', encoding='ISO-8859-1') as handle:
                return handle.readlines()

    def _add_pair(self, als, fr):
        """Count both sides of a translation pair and store it in ``db``."""
        self.count_sentences_words_als(als)
        self.count_sentences_words_fr(fr)
        self.db.append({'fr': fr, 'als': als})

    def _display_counts(self):
        """Print the total number of sentences and words per language."""
        print("Alsacien : %d sentences, %d words" % (
            sum(self.count_sentences_als.values()),
            sum(self.count_words_als.values())))
        print("Francais : %d sentences, %d words" % (
            sum(self.count_sentences_fr.values()),
            sum(self.count_words_fr.values())))

    # ------------------------------------------------------------------
    # Counting
    # ------------------------------------------------------------------
    def count_sentences_words_als(self, line):
        """
        Fill up the Alsacien dictionary of counts of sentences and words

        :param line: one Alsatian sentence; words are split on whitespace.
        """
        self._tally(self.count_sentences_als, self.count_words_als, line)

    def count_sentences_words_fr(self, line):
        """
        Fill up the French dictionary of counts of sentences and words

        :param line: one French sentence; words are split on whitespace.
        """
        self._tally(self.count_sentences_fr, self.count_words_fr, line)

    # ------------------------------------------------------------------
    # Readers
    # ------------------------------------------------------------------
    def get_data_alsaimmer(self, display=False,
                           root="/content/drive/MyDrive/www.alsa-immer.eu"):
        """
        Function to read the xml files from www.alsa-immer.eu
        and extract the database

        :param display: when true, print the running totals afterwards.
        :param root: directory holding the ``*xml`` files; files directly
            inside it and one sub-directory level below are read.
        """
        for filename in glob(root + "/*xml") + glob(root + "/*/*xml"):
            lines = iter(self._read_lines(filename))
            for line_als in lines:
                if "<als>" not in line_als:
                    continue

                if "<fr>" in line_als:
                    # Both languages sit on the same line.
                    line_fr = line_als
                else:
                    # The translation is the next line carrying a <fr> tag.
                    line_fr = next(
                        (cand for cand in lines if "<fr>" in cand), None)
                    if line_fr is None:
                        # File ended before a translation was found.
                        break

                line_als_clean = extract_between_tags(line_als, "als")
                line_fr_clean = extract_between_tags(line_fr, "fr")

                # Only keep pairs where both sides are non-empty.
                if line_als_clean and line_fr_clean:
                    self._add_pair(line_als_clean, line_fr_clean)

        if display:
            self._display_counts()

    def get_data_alsatext(
            self, display=False,
            filename="/content/drive/MyDrive/www.alsatext.eu/cours_grammaire.php"):
        """
        Script to read the file www.alsatext.eu/cours_grammaire.php
        and extract the database.

        :param display: when true, print the running totals afterwards.
        :param filename: path of the page to parse; only lines containing
            both ``<ex_als>`` and ``<ex_fr>`` are used.
        """
        with open(filename, 'rt', encoding='utf8') as fic:
            for line in fic:
                if "<ex_als>" in line and "<ex_fr>" in line:
                    line_als = extract_between_tags(line, 'ex_als')
                    line_fr = extract_between_tags(line, 'ex_fr')

                    line_als_clean = clean_line(remove_html_tags(line_als))
                    line_fr_clean = clean_line(remove_html_tags(line_fr))

                    self._add_pair(line_als_clean, line_fr_clean)

        if display:
            self._display_counts()

    def get_data_motsAlsacienMulhouse(
            self, display=False,
            path="/content/drive/MyDrive/mots_alsacien_Mulhouse.csv"):
        """
        Script to extract data from mots_alsacien_Mulhouse.csv

        The file is read as ISO-8859-1, its first line is skipped, and
        each following line is split on ``;`` (Alsatian first, French
        second).

        :param display: when true, print the running totals afterwards.
        :param path: location of the ``;``-separated file.
        """
        with open(path, 'rt', encoding="iso-8859-1") as fic:
            next(fic, None)  # skip the first line (presumably a header)
            for line in fic:
                # Drop the line terminator so it cannot end up inside the
                # French field when the row has exactly two columns.
                fields = line.rstrip("\n").split(";")
                if len(fields) < 2:
                    # Blank or malformed row: nothing to pair up.
                    continue
                self._add_pair(fields[0], fields[1])

        if display:
            self._display_counts()

    def get_data_alignments(self, display=False,
                            path="/content/drive/MyDrive/alignments.csv"):
        """
        Script to extract data from alignments.csv

        The first line is skipped. For each following line the first
        whitespace-separated token (cut at the first ``;``) of the first
        and third tab-separated columns is kept as the Alsatian and the
        French entry respectively.

        :param display: when true, print the running totals afterwards.
        :param path: location of the tab-separated file (opened with the
            platform default encoding).
        """
        with open(path, 'rt') as fic:
            next(fic, None)  # skip the first line (presumably a header)
            for line in fic:
                columns = line.split('\t')
                if len(columns) < 3:
                    continue
                als_tokens = columns[0].split()
                fr_tokens = columns[2].split()
                if not als_tokens or not fr_tokens:
                    # One side is empty: no usable pair on this row.
                    continue

                als = als_tokens[0].split(';')[0]
                fr = fr_tokens[0].split(';')[0]
                self._add_pair(als, fr)

        if display:
            self._display_counts()

    def get_data_lexique(self, display=False):
        """
        Read the 'lexique_*.pdf' and extract the data
        Create .html using `pdftohtml .pdf` command

        :param display: when true, print the running totals afterwards.
        """
        filename = "../lexique_artisans"

        # Convert the PDF, then delete the by-products that are not read.
        # NOTE(review): pdftohtml is expected to leave the page text in
        # "<stem>s.html", which is the file opened below - confirm.
        os.system('pdftohtml -q %s.pdf' % filename)
        os.system('rm %s-* %s_* %s.html' % (filename, filename, filename))

        start = False  # becomes True once the <body> tag has been seen
        N = 0          # number of pairs extracted so far
        with open("%ss.html" % filename, 'rt') as fic:
            for line in fic:
                # NOTE(review): extraction stops after 6 pairs; this cap
                # looks like a debugging leftover - confirm before removing.
                if N > 5:
                    break

                if '<body>' in line:
                    start = True
                if not start or " <br/>\n" not in line:
                    continue

                line_fr = clean_html(line.split(' <br/>')[0])
                if not line_fr:
                    continue

                # NOTE(review): assumes the Alsatian rendering is on the
                # line right after the French one, inside <i> tags.
                line_als = clean_html(
                    extract_between_tags(next(fic, ''), 'i'))

                self.db.append({'fr': line_fr, 'als': line_als})
                # NOTE(review): unconditional trace output, kept as found.
                print({'fr': line_fr, 'als': line_als})
                N = N + 1

                self.count_sentences_words_als(line_als)
                self.count_sentences_words_fr(line_fr)

        if display:
            self._display_counts()

    def create_db(self):
        """
        Create a dictionary for each sentence:
        {'fr': 'ksjdfdk', 'als':'rtefv'}

        Placeholder: the method has no body and returns ``None``; the
        pairs are appended to ``self.db`` by the ``get_data_*`` methods.
        """
| |
|
|
|
|
|
|
|
|