k-m-irfan's picture
updated large files with lfs
9f1c059
raw
history blame
1.79 kB
#!/usr/bin/env python
# coding: utf-8
# In[2]:
############################################################
#Author : Bhagyashree
#Date : 1st Sept, 2020
#Purpose : Text Cleaning
#Input : Text file after timestamp removal
#Output : Text file after cleaning data
############################################################
# In[3]:
import nltk
import numpy
import xlrd
import openpyxl
import re
import sys
# In[21]:
# Punctuation clean-up rules as (text to find, replacement) pairs.
# They are applied strictly in this order: the multi-character ' – ' rule only
# matches text as it stands after the rules listed before it have run, so do
# not reorder the table.
CLEANUP_RULES = (
    ('?', ''),
    # NOTE(review): as written this swaps a space for a space (a no-op);
    # possibly a non-breaking or double space was intended -- confirm.
    (' ', ' '),
    (';', ''),
    (')', ''),
    ('(', ''),
    ('!', ''),
    (' – ', ' '),
    ('-', ' '),
    ('।', ''),
    ('&', ''),
    ('’', ''),
    ('‘', ''),
    (':', ''),
    (',', ''),
    ('/', ''),
    ('.', ''),
    ('|', ''),
)


def clean_text(text):
    """Return *text* with the punctuation in CLEANUP_RULES removed or spaced.

    Every rule is a plain (non-regex) substring replacement, applied in table
    order to the result of the previous rule.
    """
    for old, new in CLEANUP_RULES:
        text = text.replace(old, new)
    return text


def normalize_pairs(rows):
    """Turn raw (find, replacement) cell values into usable string pairs.

    Rows where either cell is empty (``None``) are skipped: the worksheet's
    ``max_row`` can include blank rows, and passing ``None`` to ``str.replace``
    raises ``TypeError`` (or, for the first column, searches for the literal
    text "None").  Rows whose search text is the empty string are skipped too,
    because replacing '' inserts the replacement between every character.
    Both values are converted with ``str`` so numeric cells work on either
    side.
    """
    pairs = []
    for find, replacement in rows:
        if find is None or replacement is None:
            continue
        find = str(find)
        if not find:
            continue
        pairs.append((find, str(replacement)))
    return pairs


def load_substitutions(workbook_path):
    """Read substitution pairs from the active sheet of an .xlsx workbook.

    Column 1 holds the text to find and column 2 its replacement; rows are
    returned top to bottom, filtered through ``normalize_pairs``.
    """
    sheet = openpyxl.load_workbook(workbook_path).active
    rows = (
        (
            sheet.cell(row=row, column=1).value,
            sheet.cell(row=row, column=2).value,
        )
        for row in range(1, sheet.max_row + 1)
    )
    return normalize_pairs(rows)


def apply_substitutions(text, pairs):
    """Replace every occurrence of each pair's first item with its second.

    Pairs are applied in the order given, each to the output of the previous
    one.  A search text that is contained in a later one (e.g. "1" listed
    before "10") will therefore rewrite part of the later match first; order
    the sheet with that in mind.
    """
    for find, replacement in pairs:
        text = text.replace(find, replacement)
    return text


def main(argv=None):
    """Clean the input text file and write the result.

    Arguments (taken from ``sys.argv`` when *argv* is None):
        argv[1]: path of the UTF-8 text file to clean.
        argv[2]: path of the .xlsx workbook holding the substitution pairs.
        argv[3]: path of the UTF-8 text file to write.

    Exits with a usage message if fewer than three paths are supplied.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 4:
        sys.exit("usage: <script> INPUT_TXT MAPPING_XLSX OUTPUT_TXT")

    # The input is only read, so plain "r" is enough and works on read-only
    # files; the context manager closes the handle even if read() fails.
    with open(argv[1], "r", encoding='utf-8') as source:
        data = source.read()

    line = apply_substitutions(clean_text(data), load_substitutions(argv[2]))

    with open(argv[3], "w", encoding='utf-8') as target:
        target.write(line)


if __name__ == "__main__":
    main()