Sa-m committed on
Commit a87bc00
1 Parent(s): b3f8933

Upload app.py

Files changed (1)
  1. app.py +198 -0
app.py ADDED
@@ -0,0 +1,198 @@
# -*- coding: utf-8 -*-
"""trial _final yr proj.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1AGAk7En1Rd0RuEju4MzMxSCUVnGq73Es
"""

"""# MANIFESTO ANALYSIS

## IMPORTING LIBRARIES
"""
# Commented out IPython magic to ensure Python compatibility.
# %%capture
# !pip install tika
# !pip install clean-text
# !pip install gradio

import io
import random
import re
import sys
from io import StringIO
from zipfile import ZipFile

import matplotlib.pyplot as plt

import nltk
import nltk.corpus
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist
from nltk.text import Text

from tika import parser
from cleantext import clean
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from textblob import TextBlob
from PIL import Image

import gradio as gr

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
"""## PARSING FILES"""

def Parsing(parsed_text):
    parsed_text = parsed_text.name
    raw_party = parser.from_file(parsed_text)
    # parser.parse1(option='all',urlOrPath=parsed_text)
    # from_buffer(parsed_text)
    # from_file(parsed_text)
    raw_party = raw_party['content']
    return clean(raw_party)


# Added more stopwords to avoid irrelevant terms
stop_words = set(stopwords.words('english'))
stop_words.update(['ask', 'much', 'thank', 'etc.', 'e', 'We', 'In', 'ed', 'pa', 'This',
                   'also', 'A', 'fu', 'To', '5', 'ing', 'er', '2'])
"""## PREPROCESSING"""

def clean_text(text):
    '''
    Function which returns clean text
    '''
    text = text.encode("ascii", errors="ignore").decode("ascii")  # remove non-ascii characters
    text = re.sub(r"\n", " ", text)
    text = re.sub(r"\t", " ", text)
    text = re.sub(r"/ ", " ", text)
    text = re.sub(" +", " ", text).strip()  # collapse multiple spaces into one

    text = [word for word in text.split() if word not in STOPWORDS]
    text = ' '.join(text)
    return text

# text_Party=clean_text(raw_party)

def Preprocess(textParty):
    '''
    Remove special characters and extra spaces, then strip stop words.
    '''
    text1Party = re.sub('[^A-Za-z0-9]+', ' ', textParty)
    # Removing all stop words
    pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
    text2Party = pattern.sub('', text1Party)
    return text2Party
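# A quick illustration of the stop-word removal above (illustrative only, not
# executed by the app; the input is shown in lowercase because the pattern is
# built from NLTK's lowercase stop-word list):
#
#   Preprocess('education is the key to our future')  # -> 'education key future'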

# Using Concordance, you can see each time a word is used, along with its
# immediate context. It gives a peek into how a word is being used at the
# sentence level and what words are used with it.

def concordance(text_Party, strng):
    word_tokens_party = word_tokenize(text_Party)
    moby = Text(word_tokens_party)
    # Text.concordance() prints to stdout, so temporarily redirect stdout
    # into a StringIO buffer and return the captured text.
    save_stdout = sys.stdout
    result = StringIO()
    sys.stdout = result
    moby.concordance(strng, lines=10, width=82)
    sys.stdout = save_stdout
    return result.getvalue()
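# A minimal usage sketch (illustrative only; `uploaded_file` stands in for the
# temporary file object Gradio passes into `analysis` and is not defined here):
#
#   cleaned = Preprocess(clean_text(Parsing(uploaded_file)))
#   print(concordance(cleaned, 'education'))
#
# The return value is a plain multi-line string with up to 10 concordance lines,
# one per occurrence of the searched term.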

def normalize(d, target=1.0):
    '''
    Rescale the values of a dict so that they sum to `target`.
    '''
    raw = sum(d.values())
    factor = target / raw
    return {key: value * factor for key, value in d.items()}


def fDistance(text2Party):
    '''
    most frequent words search
    '''
    word_tokens_party = word_tokenize(text2Party)  # Tokenizing
    fdistance = FreqDist(word_tokens_party).most_common(10)
    mem = {}
    for x in fdistance:
        mem[x[0]] = x[1]
    return normalize(mem)


def fDistancePlot(text2Party, plotN=20):
    '''
    most frequent words visualisation
    '''
    word_tokens_party = word_tokenize(text2Party)  # Tokenizing
    fdistance = FreqDist(word_tokens_party)
    return fdistance.plot(plotN)
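# A small worked example (hypothetical counts, not taken from any manifesto):
# normalize() rescales raw frequencies into proportions summing to 1.0, which is
# the {label: confidence} shape that gr.outputs.Label expects from analysis().
#
#   counts = {'education': 12, 'economy': 8, 'health': 4}
#   normalize(counts)  # -> {'education': 0.5, 'economy': 0.333..., 'health': 0.166...}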

## UI INTERFACE

def analysis(Manifesto, Search):
    raw_party = Parsing(Manifesto)
    text_Party = clean_text(raw_party)
    text_Party = Preprocess(text_Party)
    fdist_Party = fDistance(text_Party)
    searchRes = concordance(text_Party, Search)
    # searChRes=searchRes.replace(Search,f"\u0332{Search}\u0332 ")
    # mark the searched term with combining underlines in the returned text
    searChRes = searchRes.replace(Search, "\u0332".join(Search))
    return fdist_Party, searChRes


Search_txt = gr.inputs.Textbox()
filePdf = gr.inputs.File()
text = gr.outputs.Textbox(label='SEARCHED OUTPUT')
mfw = gr.outputs.Label(label="Most Relevant topics in manifesto")

gr.Interface(fn=analysis, inputs=[filePdf, Search_txt], outputs=[mfw, text],
             title='Manifesto Analysis').launch(debug=False, share=True)
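# gr.inputs / gr.outputs is the legacy Gradio component namespace. If a newer
# Gradio (3.x or later) is installed (an assumption about the environment, not
# part of this commit), the equivalent wiring would be, as a sketch:
#
#   Search_txt = gr.Textbox(label='Search')
#   filePdf = gr.File()
#   text = gr.Textbox(label='SEARCHED OUTPUT')
#   mfw = gr.Label(label="Most Relevant topics in manifesto")
#   gr.Interface(fn=analysis, inputs=[filePdf, Search_txt], outputs=[mfw, text],
#                title='Manifesto Analysis').launch(debug=False)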