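"""Gradio app for keyword-based qualitative coding of documents.

The app downloads every document in the `claudiag/atlas` dataset, scans each
PDF/Word file for user-editable keyword lists (one list per coding category),
writes the matching sentences into one result file per category, and uploads
the result files to the `patrickvonplaten/atlas` dataset.
"""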
import glob
import os
import re
import shutil
import tempfile
import time

import gradio as gr
import pypandoc
import tqdm
from huggingface_hub import HfApi, snapshot_download
from pdfminer.high_level import extract_text

HF_TOKEN = os.environ.get("HF_TOKEN")
api = HfApi()
# from docx import Document
# document = Document()
# document.add_heading('Labels for ', level=1)

# Local folder the per-category result files are written to before upload.
RESULTS_FOLDER = "./results"

# Download the source documents (a dataset of PDF/Word files) once at startup.
DOC_FOLDER = snapshot_download("claudiag/atlas", token=HF_TOKEN, repo_type="dataset")
# Coding categories and the default keywords associated with each of them.
CAT_TO_CODEWORDS = {
    "Prejudices": ["prejudice", "judge", "preconceive", "stigma", "assumption", "assume", "misunderstanding", "unexamined", "distorted", "clear", "compar"],
    "Self-knowledge": ["self-knowledge", "self-awareness", "introspection", "examined", "myself", "realization", "belief"],
    "Similarities": ["similarity", "same", "similar", "equal", "related", "together"],
    "Diversity": ["diverse", "different", "particular", "range", "multiplicity"],
    "Business school": ["ESADE", "competitive", "business school", "education", "study", "university", "student", "consulting", "professional", "pressure", "performance", "institution"],
    "Courage": ["courage", "brave", "dare", "step", "determine"],
    "Change": ["change", "finally", "at last", "decided", "chose", "concluded", "want to", "swap", "different", "not the same", "replace", "convert", "trade", "future", "decision"],
    "Coherence": ["coherent", "align", "incoherent", "consistent"],
    "Voicing": ["speak", "express", "voice", "talk", "say", "open up", "articulate", "communicate", "convey", "reveal", "show", "verbalize", "phrase", "word"],
    "Listening": ["listen", "pay attention", "quiet", "silence", "process", "hear", "attend"],
    "Understanding": ["learn", "understand", "realize", "see", "believe", "question", "critical", "thought", "reasonable", "logical", "rational", "comprehensible", "accept"],
    "Relationships": ["relationship", "relate", "bond", "connection", "others", "appreciate", "appreciation", "recognize", "recognition", "acknowledge"],
    "Emotions": ["emotions", "felt", "feel", "a feeling of", "sense", "sensation", "instinct", "sentiment", "gut feeling", "intense", "wave"],
    "The course": ["first time", "never", "always", "course", "elective", "Socratic Dialogue", "dialogue", "debate", "enroll", "arguments"],
}

CATEGORIES = CAT_TO_CODEWORDS.keys()
def retrieve_lines(filename):
    """Extract the text of a PDF or Word file and split it into sentence-like chunks."""
    extension = filename.split(".")[-1]
    if extension == "pdf":
        text = extract_text(filename)
        lines = text.split("\n")
    elif extension in ["docx", "doc"]:
        # Convert the Word document to plain text via pandoc, then read it back in.
        with tempfile.TemporaryDirectory() as tmpdirname:
            outfile = os.path.join(tmpdirname, "temp.txt")
            pypandoc.convert_file(filename, "plain", outputfile=outfile)
            with open(outfile, "r") as f:
                lines = f.readlines()
    else:
        raise ValueError(f"Unsupported file extension: {extension}")

    # Join everything into one string and split on "." to get rough sentences.
    lines = [l.strip() for l in lines]
    lines = " ".join(lines)
    lines = lines.split(".")

    return lines
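# For illustration (hypothetical file): retrieve_lines("interviews/a/essay.docx") returns
# something like ["I had never heard of a Socratic Dialogue before", " It changed how I listen", ...],
# i.e. the whole document flattened to one string and split on ".".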
def match_code(lines, codewords):
    """Map line index -> line for every line containing a codeword, with matches uppercased."""
    match_dict = {}
    # re.escape keeps user-typed punctuation in a keyword from being read as regex syntax.
    keywords_to_match = re.compile(fr'\b(?:{"|".join(map(re.escape, codewords))})\b')

    for i, line in enumerate(lines):
        matches = list(keywords_to_match.finditer(line))
        if len(matches) > 0:
            for m in matches:
                start, end = m.span()
                line = line[:start] + line[start:end].upper() + line[end:]
            # Collapse whitespace before storing the annotated line.
            match_dict[i] = " ".join(line.split())

    return match_dict
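# For illustration (hypothetical input):
#   match_code(["I learned to listen", "No keyword here"], ["listen"])
#   -> {0: "I learned to LISTEN"}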
def main(filename, codewords_mapping):
    """Scan one document and append its keyword matches to the per-category result files."""
    lines = retrieve_lines(filename)

    files = []
    for label, codewords in codewords_mapping.items():
        match = match_code(lines, codewords)

        out = ""
        if len(match) > 0:
            result_file = ".".join(["_".join(label.split()), "result", "txt"])
            result_file = os.path.join(RESULTS_FOLDER, result_file)

            # Write the category header only once, when the file is first created.
            if not os.path.exists(result_file):
                out += f"# Code: {label}\n"
                out += 25 * "="
                out += "\n\n"

            # One block per source document: its name followed by the matched sentences.
            out += f"## Source: {'/'.join(filename.split('/')[-2:])}\n"
            out += 25 * "-"
            out += "\n"
            out += "\n".join([f"-{v}" for v in match.values()])
            out += "\n"
            out += 25 * "-"
            out += "\n\n"

            with open(result_file, "a") as f:
                f.write(out)

            files.append(result_file)

    return files
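# Each result file (e.g. ./results/The_course.result.txt) starts with a "# Code: ..." header
# and then holds one "## Source: ..." block per document that matched, listing the matched
# sentences with their keywords in UPPERCASE.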
def convert(*keywords):
    """Gradio callback: run the keyword search over every document and upload the results."""
    # The textboxes arrive in the same order as CATEGORIES; keywords are comma-separated,
    # and surrounding whitespace is ignored.
    codewords_mapping = {
        k: [w.strip() for w in v.split(",") if w.strip()]
        for k, v in zip(CATEGORIES, keywords)
    }

    num_files = 0
    # Start from a clean results folder on every run.
    shutil.rmtree(RESULTS_FOLDER, ignore_errors=True)
    os.makedirs(RESULTS_FOLDER)

    result_files = []
    folders = glob.glob(os.path.join(DOC_FOLDER, "*"))
    for folder in tqdm.tqdm(folders):
        all_files = tqdm.tqdm(glob.glob(f"{folder}/*"))
        num_files += len(all_files)
        for filename in all_files:
            try:
                result_files += main(filename, codewords_mapping)
            except Exception as e:
                print(f"{filename} not working because \n {e}")

    result_files = list(set(result_files))

    # Upload the result files to a timestamped folder in the results dataset.
    api.upload_folder(
        repo_id="patrickvonplaten/atlas",
        folder_path=RESULTS_FOLDER,
        path_in_repo=f"results_{time.time()}",
        repo_type="dataset",
        token=HF_TOKEN,
    )

    return f"Done. Processed {num_files} files."
# One comma-separated keyword box per category, pre-filled with the default keywords.
inputs = [
    gr.Textbox(label=f"Enter your keywords for {k}", max_lines=2, placeholder=",".join(CAT_TO_CODEWORDS[k]), value=",".join(CAT_TO_CODEWORDS[k]))
    for k in CATEGORIES
]

iface = gr.Interface(fn=convert, inputs=inputs, outputs="text")
iface.launch()
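# Note: running this app assumes the `pandoc` binary is available (pypandoc shells out to it
# for .docx/.doc conversion) and that HF_TOKEN is set in the environment with read access to
# the claudiag/atlas dataset and write access to the patrickvonplaten/atlas dataset.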