Spaces:
Runtime error
Runtime error
patrickvonplaten
committed on
Commit
•
7d2f336
1
Parent(s):
9fa6437
finish
Browse files- app.py +139 -3
- requirements.txt +2 -0
app.py
CHANGED
@@ -1,7 +1,143 @@
|
|
1 |
import gradio as gr
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
|
3 |
-
|
4 |
-
|
|
|
|
|
5 |
|
6 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
iface.launch()
|
|
|
1 |
import gradio as gr
|
2 |
+
import pypandoc
|
3 |
+
import glob
|
4 |
+
import shutil
|
5 |
+
import os
|
6 |
+
import tqdm
|
7 |
+
import tempfile
|
8 |
+
import re
|
9 |
+
from pdfminer.high_level import extract_text
|
10 |
|
11 |
+
# Folder that receives one "<Category>.result.txt" file per matched category.
RESULTS_FOLDER = "./results"

# Default codewords for every category. The keys double as the category
# labels shown in the UI and as the base of the result file names.
CAT_TO_CODEWORDS = {
    "Prejudices": ["prejudice", "judge", "preconceive", "stigma", "assumption", "assume", "misunderstanding", "unexamined", "distorted", "clear", "compar"],
    "Self-knowledge": ["self-knowledge", "self-awareness", "introspection", "examined", "myself", "realization", "belief"],
    "Similarities": ["similarity", "same", "similar", "equal", "related", "together"],
    "Diversity": ["diverse", "different", "particular", "range", "multiplicity"],
    "Business school": ["ESADE", "competitive", "business school", "education", "study", "university", "student", "consulting", "professional", "pressure", "performance", "institution"],
    "Courage": ["courage", "brave", "dare", "step", "determine"],
    "Change": ["change", "finally", "at last", "decided", "chose", "concluded", "want to", "swap", "different", "not the same", "replace", "convert", "trade", "future", "decision"],
    "Coherence": ["coherent", "align", "incoherent", "consistent"],
    "Voicing": ["speak", "express", "voice", "talk", "say", "open up", "articulate", "communicate", "convey", "reveal", "show", "verbalize", "phrase", "word"],
    "Listening": ["listen", "pay attention", "quiet", "silence", "process", "hear", "attend"],
    "Understanding": ["learn", "understand", "realize", "see", "believe", "question", "critical", "thought", "reasonable", "logical", "rational", "comprehensible", "accept"],
    "Relationships": ["relationship", "relate", "bond", "connection", "others", "appreciate", "appreciation", "recognize", "recognition", "acknowledge"],
    "Emotions": ["emotions", "felt", "feel", "a feeling of", "sense", "sensation", "instinct", "sentiment", "gut feeling", "intense", "wave"],
    "The course": ["first time", "never", "always", "course", "elective", "Socratic Dialogue", "dialogue", "debate", "enroll", "arguments"],
}

# Category labels in definition order (a fixed snapshot of the dict keys).
CATEGORIES = tuple(CAT_TO_CODEWORDS)
|
52 |
+
|
53 |
+
def retrieve_lines(filename):
    """Extract the text of a document and split it into sentence-like chunks.

    PDF files are read with ``extract_text``; ``.docx``/``.doc`` files are
    converted to plain text with ``pypandoc``. The extracted lines are
    stripped, joined with single spaces and then split on ``"."``.

    Args:
        filename: Path of the document to read. The type is chosen from the
            file extension, compared case-insensitively.

    Returns:
        A list of strings, one per ``"."``-separated chunk of the document.

    Raises:
        ValueError: If the extension is not ``pdf``, ``docx`` or ``doc``.
    """
    # splitext only looks at the last path component, so a dot in a directory
    # name is not mistaken for the file extension.
    extension = os.path.splitext(filename)[1].lower().lstrip(".")

    if extension == "pdf":
        raw_lines = extract_text(filename).split("\n")
    elif extension in ("docx", "doc"):
        # NOTE(review): .doc is routed to pandoc as in the original code;
        # confirm the installed pandoc can actually read legacy .doc files.
        with tempfile.TemporaryDirectory() as tmpdirname:
            outfile = os.path.join(tmpdirname, "temp.txt")
            pypandoc.convert_file(filename, "plain", outputfile=outfile)
            # Pandoc writes UTF-8, so do not rely on the locale's encoding.
            with open(outfile, "r", encoding="utf-8") as f:
                raw_lines = f.readlines()
    else:
        # Fail with a clear message instead of an UnboundLocalError below.
        raise ValueError(
            f"Unsupported file type {extension!r} for {filename!r}; "
            "expected pdf, docx or doc"
        )

    text = " ".join(line.strip() for line in raw_lines)
    return text.split(".")
|
72 |
+
|
73 |
+
def match_code(lines, codewords):
    """Find the lines that contain any codeword and highlight the hits.

    Codewords are matched literally, case-sensitively and as whole words
    (``\\b`` on both sides). Every hit is upper-cased in the returned text and
    runs of whitespace are collapsed to single spaces.

    Args:
        lines: Sequence of text chunks to search.
        codewords: Iterable of keyword strings. A single string is treated as
            one keyword. Empty keywords are ignored.

    Returns:
        A dict mapping the index of each matching line to its highlighted,
        whitespace-normalised text. Empty if there are no usable codewords.
    """
    if isinstance(codewords, str):
        # Iterating a bare string would turn every character into a keyword.
        codewords = [codewords]

    # Escape so that characters such as "." or "+" are matched literally.
    terms = [re.escape(word) for word in codewords if word]
    if not terms:
        # An empty alternation would match at every word boundary and flag
        # every non-empty line.
        return {}

    keywords_to_match = re.compile(r"\b(?:" + "|".join(terms) + r")\b")

    match_dict = {}
    for i, line in enumerate(lines):
        # subn rewrites all hits in one pass, so the result stays correct even
        # when upper-casing changes the length of a match.
        highlighted, hits = keywords_to_match.subn(
            lambda m: m.group(0).upper(), line
        )
        if hits:
            match_dict[i] = " ".join(highlighted.split())

    return match_dict
|
88 |
+
|
89 |
+
def main(filename, codewords_mapping):
    """Match one document against every category and append the hits to disk.

    For each category with at least one matching line, a section listing the
    highlighted lines is appended to
    ``RESULTS_FOLDER/<Category_label>.result.txt``. A "# Code" header is
    written first when that result file does not exist yet.

    Args:
        filename: Path of the document to analyse (see ``retrieve_lines``).
        codewords_mapping: Dict mapping a category label to its codewords.
    """
    lines = retrieve_lines(filename)

    # Appending below fails with FileNotFoundError if the folder is missing.
    os.makedirs(RESULTS_FOLDER, exist_ok=True)

    rule = 25 * "-"
    for label, codewords in codewords_mapping.items():
        match = match_code(lines, codewords)
        if not match:
            continue

        result_file = ".".join(["_".join(label.split()), "result", "txt"])
        result_file = os.path.join(RESULTS_FOLDER, result_file)

        out = ""
        if not os.path.exists(result_file):
            # First hit for this category: start the file with its title.
            out += f"# Code: {label}\n" + 25 * "=" + "\n\n"

        out += f"## Source: {filename}\n" + rule + "\n"
        out += "\n".join(f"-{text}" for text in match.values())
        out += "\n" + rule + "\n\n"

        # Explicit encoding so non-ASCII document text is written the same
        # way on every platform.
        with open(result_file, "a", encoding="utf-8") as f:
            f.write(out)
|
114 |
+
|
115 |
+
def convert(*keywords):
    """Run the codeword matcher over every file in every sub-folder of ".".

    Each positional argument supplies the keywords for the category at the
    same position in ``CATEGORIES``. A string is split on commas. An empty
    value falls back to that category's defaults in ``CAT_TO_CODEWORDS``.

    Side effects: ``RESULTS_FOLDER`` is deleted and recreated once, then
    ``main`` appends the matches of every processed file to it. Files that
    fail to process are reported on stdout and skipped.

    Args:
        *keywords: One entry per category, either a comma-separated string or
            an iterable of keyword strings.

    Returns:
        A summary string with the number of files that were processed.
    """
    codewords_mapping = {}
    for category, raw in zip(CATEGORIES, keywords):
        if isinstance(raw, str):
            # Text boxes deliver one string per category, not a list.
            words = [word.strip() for word in raw.split(",")]
        else:
            words = [str(word).strip() for word in (raw or [])]
        words = [word for word in words if word]
        # NOTE(review): falling back to the defaults for an empty box is an
        # assumption based on the defaults being shown as placeholder text.
        codewords_mapping[category] = words or list(CAT_TO_CODEWORDS[category])

    # Reset the output once, up front. Doing it per input folder would throw
    # away the results of every folder but the last.
    shutil.rmtree(RESULTS_FOLDER, ignore_errors=True)
    os.makedirs(RESULTS_FOLDER)
    results_dir = os.path.abspath(RESULTS_FOLDER)

    num_files = 0
    for folder in tqdm.tqdm(glob.glob("./*")):
        # Only descend into real input folders, never into our own output.
        if not os.path.isdir(folder) or os.path.abspath(folder) == results_dir:
            continue

        for filename in glob.glob(os.path.join(folder, "*")):
            if not os.path.isfile(filename):
                continue
            num_files += 1
            try:
                main(filename, codewords_mapping)
            except Exception as e:
                # Best effort: one unreadable document must not abort the run.
                print(f"{filename} not working because \n {e}")

    return f"Retrieved from {num_files}"
|
138 |
+
|
139 |
+
# One text box per category; the default codewords are shown as a hint.
# gr.Textbox takes `placeholder` (a single string), not `placeholders`.
inputs = [
    gr.Textbox(
        label=f"Enter your keywords for {k}",
        max_lines=2,
        placeholder=", ".join(CAT_TO_CODEWORDS[k]),
    )
    for k in CATEGORIES
]

# `convert` is the handler defined in this file; `greet` no longer exists.
iface = gr.Interface(fn=convert, inputs=inputs, outputs="text")
iface.launch()
|
requirements.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
pypandoc
|
2 |
+
pdfminer.six
|