patrickvonplaten committed on
Commit
7d2f336
1 Parent(s): 9fa6437
Files changed (2) hide show
  1. app.py +139 -3
  2. requirements.txt +2 -0
app.py CHANGED
@@ -1,7 +1,143 @@
1
  import gradio as gr
 
 
 
 
 
 
 
 
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
 
 
5
 
6
- iface = gr.Interface(fn=greet, inputs="text", outputs="text")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  iface.launch()
 
1
  import gradio as gr
2
+ import pypandoc
3
+ import glob
4
+ import shutil
5
+ import os
6
+ import tqdm
7
+ import tempfile
8
+ import re
9
+ from pdfminer.high_level import extract_text
10
 
11
+ #from docx import Document
12
+ #document = Document()
13
+ #document.add_heading('Labels for ', level=1)
14
# Folder that receives one "<Category>.result.txt" file per matched category.
RESULTS_FOLDER = "./results"

# Default codewords per category. The mapping is defined exactly once, and each
# list holds every codeword only once.
# NOTE(review): match_code() wraps the codewords in \b...\b, so an entry such as
# "compar" only matches that exact token, not "compare"/"comparison" -- confirm
# whether stem matching was intended.
CAT_TO_CODEWORDS = {
    "Prejudices": ["prejudice", "judge", "preconceive", "stigma", "assumption", "assume", "misunderstanding", "unexamined", "distorted", "clear", "compar"],
    "Self-knowledge": ["self-knowledge", "self-awareness", "introspection", "examined", "myself", "realization", "belief"],
    "Similarities": ["similarity", "same", "similar", "equal", "related", "together"],
    "Diversity": ["diverse", "different", "particular", "range", "multiplicity"],
    "Business school": ["ESADE", "competitive", "business school", "education", "study", "university", "student", "consulting", "professional", "pressure", "performance", "institution"],
    "Courage": ["courage", "brave", "dare", "step", "determine"],
    "Change": ["change", "finally", "at last", "decided", "chose", "concluded", "want to", "swap", "different", "not the same", "replace", "convert", "trade", "future", "decision"],
    "Coherence": ["coherent", "align", "incoherent", "consistent"],
    "Voicing": ["speak", "express", "voice", "talk", "say", "open up", "articulate", "communicate", "convey", "reveal", "show", "verbalize", "phrase", "word"],
    "Listening": ["listen", "pay attention", "quiet", "silence", "process", "hear", "attend"],
    "Understanding": ["learn", "understand", "realize", "see", "believe", "question", "critical", "thought", "reasonable", "logical", "rational", "comprehensible", "accept"],
    "Relationships": ["relationship", "relate", "bond", "connection", "others", "appreciate", "appreciation", "recognize", "recognition", "acknowledge"],
    "Emotions": ["emotions", "felt", "feel", "a feeling of", "sense", "sensation", "instinct", "sentiment", "gut feeling", "intense", "wave"],
    "The course": ["first time", "never", "always", "course", "elective", "Socratic Dialogue", "dialogue", "debate", "enroll", "arguments"],
}

# Category names in definition order; a list snapshot rather than a live
# dict view, because the order is used to pair categories with positional inputs.
CATEGORIES = list(CAT_TO_CODEWORDS)
52
+
53
def retrieve_lines(filename):
    """Extract the text of a document and split it on full stops.

    PDF files are read with pdfminer's ``extract_text``; ``docx``/``doc``
    files are converted to plain text with pypandoc through a temporary file.
    Every physical line is stripped, the lines are joined with single spaces,
    and the resulting text is split on ``"."``.

    Args:
        filename: Path of the document to read. The type is chosen from the
            file extension, compared case-insensitively.

    Returns:
        A list of strings, one per ``"."``-separated segment of the text.

    Raises:
        ValueError: If the extension is not ``pdf``, ``docx`` or ``doc``
            (previously this surfaced as an UnboundLocalError).
    """
    # splitext copes with dotted directory names and extension-less files,
    # which a bare split(".") does not.
    extension = os.path.splitext(filename)[1].lower().lstrip(".")

    if extension == "pdf":
        raw_lines = extract_text(filename).split("\n")
    elif extension in ("docx", "doc"):
        with tempfile.TemporaryDirectory() as tmpdirname:
            outfile = os.path.join(tmpdirname, "temp.txt")
            pypandoc.convert_file(filename, "plain", outputfile=outfile)
            # pandoc writes UTF-8; do not depend on the platform's locale.
            with open(outfile, "r", encoding="utf-8") as f:
                raw_lines = f.readlines()
    else:
        raise ValueError(f"Unsupported file type: {filename!r}")

    text = " ".join(line.strip() for line in raw_lines)
    return text.split(".")
72
+
73
def match_code(lines, codewords):
    """Find the lines that contain any codeword and highlight the hits.

    Codewords are matched case-sensitively as whole words (``\\b`` on both
    sides) and are treated as literal text, so regex metacharacters in a
    codeword cannot break or alter the pattern.

    Args:
        lines: Sequence of text segments to search.
        codewords: Iterable of literal codewords; empty strings are ignored.

    Returns:
        A dict mapping the index of each matching line to that line with
        every hit upper-cased and all whitespace runs collapsed to single
        spaces. Empty when nothing matches or no codewords are given.
    """
    escaped = [re.escape(word) for word in codewords if word]
    if not escaped:
        # An empty alternation would match at every word boundary and flag
        # every non-empty line as a hit.
        return {}

    alternatives = "|".join(escaped)
    keywords_to_match = re.compile(rf"\b(?:{alternatives})\b")

    match_dict = {}
    for i, line in enumerate(lines):
        # subn upper-cases each hit in one pass and reports the hit count,
        # without assuming that upper() preserves the string length.
        highlighted, hits = keywords_to_match.subn(
            lambda m: m.group(0).upper(), line
        )
        if hits:
            match_dict[i] = " ".join(highlighted.split())

    return match_dict
88
+
89
def main(filename, codewords_mapping):
    """Append the codeword matches found in one document to the result files.

    The document is read with ``retrieve_lines``. For every category that has
    at least one match (per ``match_code``), a section listing the matching
    lines is appended to ``<RESULTS_FOLDER>/<Category_with_underscores>.result.txt``.
    A file-level header is written only when that result file does not exist
    yet.

    Args:
        filename: Path of the document to analyse.
        codewords_mapping: Dict mapping a category label to its codewords.

    Returns:
        None. Output is written to files under ``RESULTS_FOLDER``.
    """
    lines = retrieve_lines(filename)

    for label, codewords in codewords_mapping.items():
        match = match_code(lines, codewords)
        if not match:
            continue

        result_name = ".".join(["_".join(label.split()), "result", "txt"])
        result_file = os.path.join(RESULTS_FOLDER, result_name)

        parts = []
        if not os.path.exists(result_file):
            # First hit for this category: start the file with its header.
            parts.append(f"# Code: {label}\n")
            parts.append(25 * "=")
            parts.append("\n\n")

        parts.append(f"## Source: {filename}\n")
        parts.append(25 * "-")
        parts.append("\n")
        parts.append("\n".join(f"-{text}" for text in match.values()))
        parts.append("\n")
        parts.append(25 * "-")
        parts.append("\n\n")

        # Opening in append mode fails if the folder is missing, so make sure
        # it exists before writing.
        os.makedirs(RESULTS_FOLDER, exist_ok=True)
        with open(result_file, "a", encoding="utf-8") as f:
            f.write("".join(parts))
114
+
115
def convert(*keywords):
    """Run the codeword search over every file in the sub-folders of ``./``.

    Args:
        *keywords: One entry per category, in ``CATEGORIES`` order. A string
            is read as a comma-separated list of codewords; any other
            iterable is used as the codeword list directly. A blank or empty
            entry falls back to that category's defaults in
            ``CAT_TO_CODEWORDS``.

    Returns:
        The string ``"Retrieved from <n>"``, where ``n`` is the number of
        files found, including files that could not be processed.
    """
    codewords_mapping = {}
    for category, entry in zip(CATEGORIES, keywords):
        if isinstance(entry, str):
            # Text inputs arrive as one string; passing it on unsplit would
            # make every single character a codeword.
            words = [w.strip() for w in entry.split(",") if w.strip()]
        else:
            words = list(entry or [])
        codewords_mapping[category] = words or CAT_TO_CODEWORDS[category]

    # Start from a clean results folder once per run, not once per folder,
    # so results from earlier folders are not wiped.
    shutil.rmtree(RESULTS_FOLDER, ignore_errors=True)
    os.makedirs(RESULTS_FOLDER)
    results_dir = os.path.normpath(RESULTS_FOLDER)

    num_files = 0
    for folder in tqdm.tqdm(glob.glob("./*")):
        if os.path.normpath(folder) == results_dir:
            # Never feed the output folder back in as input.
            continue

        all_files = glob.glob(os.path.join(folder, "*"))
        num_files += len(all_files)

        for filename in tqdm.tqdm(all_files):
            try:
                main(filename, codewords_mapping)
            except Exception as e:
                # Best effort: one unreadable file must not abort the run.
                print(f"{filename} not working because \n {e}")

    return f"Retrieved from {num_files}"
138
+
139
# One textbox per category. The default codewords are shown as a single
# comma-separated placeholder string (gr.Textbox takes `placeholder`, a str).
inputs = [
    gr.Textbox(
        label=f"Enter your keywords for {k}",
        max_lines=2,
        placeholder=", ".join(CAT_TO_CODEWORDS[k]),
    )
    for k in CATEGORIES
]

# Wire the form to convert(); the old `greet` demo function no longer exists.
iface = gr.Interface(fn=convert, inputs=inputs, outputs="text")
iface.launch()
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ pypandoc
2
+ pdfminer