Boulbaba's picture
Upload 210 files
21baa2f verified
'''
Created on 11/05/2010
@Created by: Muhammad Altabba
'''
from Models.Lexicon.RootsAndPatternsRepository import *;
from Models.Lexicon.SpecialWords.StandAloneParticle import *;
from Models.Lexicon.SpecialWords.ProperNoun import *;
from Controllers.TextEntities.TextEncapsulator import *;
from Controllers.TextEntities.Word import *;
from Controllers.Tokenization.Tokenizer import *;
from Controllers.Normalization.Normalizer import *;
from Controllers.Morphology.AffixParser import *;
from Controllers.Morphology.MorphologicalAnalyzer import *;
import codecs;
import io;
import os;
from os.path import join, getsize;
# Locations of the data files used by this script. The relative paths are
# resolved against the current working directory at run time.
_morphologyTransducersDir = '../../Data/MorphologyTransducers/'
_taggingRepositoryDir = '../../Data/TaggingRepository/'

# NOTE(review): this holds the same path as procliticsXmlFile (Proclitics.xml)
# and is not referenced in the visible part of this script - confirm whether a
# dedicated compound-nouns file was intended.
compoundNounsXmlFile = _morphologyTransducersDir + 'Proclitics.xml'
procliticsXmlFile = _morphologyTransducersDir + 'Proclitics.xml'
encliticsXmlFile = _morphologyTransducersDir + 'Enclitics.xml'

prematureTaggingRulesXmlFile = _taggingRepositoryDir + 'PrematureTaggingRules.xml'
overdueTaggingRulesXmlFile = _taggingRepositoryDir + 'OverdueTaggingRules.xml'

# Machine-specific absolute path and folder name; both are handed to
# text.LoadFromFiles as its first two arguments.
baseDirectoryOfAlKhalil = 'D:/temp/AlKhalil_1/db/'
rootsFolder = 'roots2'
# One TextEncapsulator is created here and reused for every corpus file below.
# LoadFromFiles is given the AlKhalil base directory and roots folder, the
# proclitic/enclitic files and the two tagging-rule files, in that order.
text = TextEncapsulator()
text.LoadFromFiles(
    baseDirectoryOfAlKhalil,
    rootsFolder,
    procliticsXmlFile,
    encliticsXmlFile,
    prematureTaggingRulesXmlFile,
    overdueTaggingRulesXmlFile,
)
# Root of the corpus to analyze (machine-specific absolute path).
base = 'D:/temp/Latifa2/'

_inputSuffix = '.txt'
_outputSuffix = '-Qutuf.txt'


def _isPendingCorpusFile(fileName):
    """Return True when fileName is an input this run should analyze.

    A file qualifies when its name ends with '.txt', contains no '-' and
    contains 'Edu'. The '-' rule also excludes the '-Qutuf.txt' outputs
    written next to the inputs, so results are never fed back in.
    """
    return (fileName.endswith(_inputSuffix)
            and '-' not in fileName
            and 'Edu' in fileName)


def _analyzeCorpusFile(analyzer, sourcePath, targetPath):
    """Run the simple-stem pipeline on one file and write the rendered result.

    analyzer   -- the shared TextEncapsulator instance.
    sourcePath -- UTF-8 text file to read.
    targetPath -- file to (over)write with the rendered output, UTF-8 encoded.
    """
    # codecs.open is kept on purpose: it performs no newline translation, so
    # the text read and the bytes written match what the script always produced.
    with codecs.open(sourcePath, 'r', 'utf-8') as reader:
        content = reader.read()

    analyzer.String = content
    analyzer.Tokenize()
    analyzer.Normalize(2)
    analyzer.ParseClitics()
    print('\tProcessing...')
    analyzer.PatternMatchingSimpleStem()
    print('\tWriting...')

    # Render into memory first so the target file is only created (or
    # truncated) once rendering has finished without raising.
    with io.StringIO() as rendered:
        analyzer.RenderTextSimpleStem(rendered)
        output = rendered.getvalue()

    with codecs.open(targetPath, 'w', 'utf-8') as writer:
        writer.write(output)


# Only the immediate sub-directories of base are listed here; each one is then
# walked recursively, so every file below it is visited exactly once. A missing
# base directory simply yields no work, as os.walk itself would.
topLevelDirs = next(os.walk(base), (base, [], []))[1]
for dirName in topLevelDirs:
    print('Start parsing directory: ['+dirName+']')
    # os.path.join instead of plain concatenation: the path must be valid
    # whether or not base ends with a separator.
    for subroot, subdirs, subfiles in os.walk(os.path.join(base, dirName)):
        for fileName in subfiles:
            if not _isPendingCorpusFile(fileName):
                continue
            print('\tStart parsing file: ['+fileName+']')
            # Swap only the trailing '.txt'; str.replace would also rewrite
            # any earlier '.txt' inside the name.
            outputName = fileName[:-len(_inputSuffix)] + _outputSuffix
            _analyzeCorpusFile(text,
                               os.path.join(subroot, fileName),
                               os.path.join(subroot, outputName))
            print ('\tEnd parsing file: ['+fileName+']')
    # NOTE(review): the nesting of this script was reconstructed; the separator
    # is assumed to be printed once per top-level directory - confirm.
    print('------------------------------------------------------')