Spaces:

patrickvonplaten
/

atlas_app

Runtime error

App Files Files Community

atlas_app / app.py

patrickvonplaten

Merge branch 'main' of https://huggingface.co/spaces/patrickvonplaten/atlas_app

a5f0c8c over 1 year ago

raw

history blame contribute delete

5.68 kB

	import gradio as gr
	import pypandoc
	import glob
	import shutil
	import os
	import tqdm
	from huggingface_hub import snapshot_download
	from huggingface_hub import HfApi
	import tempfile
	import re
	from pdfminer.high_level import extract_text
	import time

	# Hugging Face access token, read from the environment (None when unset).
	HF_TOKEN = os.environ.get("HF_TOKEN")

	# Hub client; used further down to upload the results folder.
	api = HfApi()


	# Disabled snippet that would start a python-docx document with a heading.
	#from docx import Document
	#document = Document()
	#document.add_heading('Labels for ', level=1)
	# Local folder into which the per-category result files are written.
	RESULTS_FOLDER = "./results"

	# Fetch the source dataset snapshot at import time; the returned value is
	# used below as the local directory that holds the documents.
	DOC_FOLDER = snapshot_download("claudiag/atlas", token=HF_TOKEN, repo_type="dataset")

	# Default codewords for each category; also used to pre-fill the UI textboxes.
	CAT_TO_CODEWORDS = {
	"Prejudices": ["prejudice", "judge", "preconceive", "stigma", "assumption", "assume", "misunderstanding", "unexamined", "distorted", "clear", "compar"],
	"Self-knowledge": ["self-knowledge", "self-awareness", "introspection", "examined", "myself", "realization", "belief"],
	"Similarities": ["similarity", "same", "similar", "equal", "related", "together"],
	"Diversity": ["diverse", "different", "diverse", "particular", "range", "multiplicity"],
	"Business school": ["ESADE", "competitive", "business school", "education", "study", "university", "student", "consulting", "professional", "pressure", "performance", "institution"],
	"Courage": ["courage", "brave", "dare", "step", "determine"],
	"Change": ["change", "finally", "at last", "decided", "chose", "concluded", "want to", "swap", "different", "not the same", "replace", "convert", "trade", "future", "decision"],
	"Coherence": ["coherent", "align", "incoherent", "consistent"],
	"Voicing": ["speak", "express", "voice", "talk", "say", "open up", "articulate", "communicate", "convey", "reveal", "show", "verbalize", "phrase", "word"],
	"Listening": ["listen", "pay attention", "quiet", "silence", "process", "hear", "attend"],
	"Understanding": ["learn", "understand", "realize", "see", "believe", "question", "critical", "thought", "reasonable", "logical", "rational", "comprehensible", "accept"],
	"Relationships": ["relationship", "relate", "bond", "connection", "bond", "others", "appreciate", "appreciation", "recognize", "recognition", "acknowledge"],
	"Emotions": ["emotions", "felt", "feel", "a feeling of", "sense", "sensation", "instinct", "sentiment", "gut feeling", "intense", "wave"],
	"The course": ["first time", "never", "always", "course", "elective", "Socratic Dialogue", "dialogue", "debate", "enroll", "arguments"],
	}

	# Keys view over CAT_TO_CODEWORDS: the category names in definition order.
	CATEGORIES = CAT_TO_CODEWORDS.keys()

	def retrieve_lines(filename):
	    """Extract the text of a document and split it into sentence-like chunks.

	    PDF files are read with pdfminer's ``extract_text``; ``.doc``/``.docx``
	    files are converted to plain text with pandoc through a temporary file.
	    The resulting lines are stripped, joined with single spaces and then
	    split on ``"."``, so each returned item is roughly one sentence (without
	    its trailing period).

	    Args:
	        filename: Path to the document. The text after the last ``.`` is
	            taken as the extension and compared case-insensitively.

	    Returns:
	        A list of strings, one per ``"."``-delimited chunk of the document.

	    Raises:
	        ValueError: If the extension is not ``pdf``, ``doc`` or ``docx``.
	    """
	    extension = filename.split(".")[-1].lower()

	    if extension == "pdf":
	        raw_lines = extract_text(filename).split("\n")
	    elif extension in ("docx", "doc"):
	        with tempfile.TemporaryDirectory() as tmpdirname:
	            outfile = os.path.join(tmpdirname, "temp.txt")
	            pypandoc.convert_file(filename, 'plain', outputfile=outfile)
	            # pandoc writes UTF-8, so do not depend on the locale default.
	            with open(outfile, "r", encoding="utf-8") as f:
	                raw_lines = f.readlines()
	    else:
	        # Without this branch the code fell through to an UnboundLocalError.
	        raise ValueError(f"Unsupported file extension {extension!r}: {filename}")

	    text = " ".join(line.strip() for line in raw_lines)
	    return text.split(".")

	def match_code(lines, codewords):
	    """Find the lines that contain any codeword and highlight the hits.

	    Codewords are matched as literal, case-sensitive text delimited by word
	    boundaries. Every hit inside a matching line is upper-cased and runs of
	    whitespace in that line are collapsed to single spaces.

	    Args:
	        lines: Sequence of text chunks to search.
	        codewords: Iterable of keyword strings; surrounding whitespace is
	            ignored and blank entries are skipped.

	    Returns:
	        A dict mapping the index of each matching line to its highlighted,
	        whitespace-normalised text. Empty when nothing matches or when no
	        usable codeword was given.
	    """
	    # Drop blanks: an empty alternative would match at every word boundary
	    # and therefore flag every single line.
	    cleaned = [word.strip() for word in codewords]
	    cleaned = [word for word in cleaned if word]
	    if not cleaned:
	        return {}

	    # Codewords are literal text typed by a user, so escape regex
	    # metacharacters. Building the alternation outside the f-string also
	    # avoids a backslash inside the replacement field.
	    alternation = "|".join(re.escape(word) for word in cleaned)
	    keywords_to_match = re.compile(rf"\b(?:{alternation})\b")

	    match_dict = {}
	    for i, line in enumerate(lines):
	        # subn rewrites every hit in one pass, so a case conversion that
	        # changes the string length cannot shift later match positions.
	        highlighted, hits = keywords_to_match.subn(lambda m: m.group(0).upper(), line)
	        if hits:
	            match_dict[i] = " ".join(highlighted.split())

	    return match_dict

	def main(filename, codewords_mapping):
	    """Append the matches found in one document to the per-code result files.

	    The document is split into chunks with ``retrieve_lines``. For every
	    label whose codewords match at least one chunk, a section listing the
	    highlighted chunks is appended to ``<label>.result.txt`` inside
	    ``RESULTS_FOLDER`` (whitespace in the label is replaced by ``_``). A
	    title header is written first when that file does not exist yet.

	    Args:
	        filename: Path of the document to analyse.
	        codewords_mapping: Dict mapping a label to its list of codewords.

	    Returns:
	        The list of result-file paths that were written for this document.
	    """
	    lines = retrieve_lines(filename)
	    # Only the parent folder and file name identify the source in the report.
	    source = '/'.join(filename.split('/')[-2:])
	    files = []

	    for label, codewords in codewords_mapping.items():
	        match = match_code(lines, codewords)
	        if not match:
	            continue

	        result_file = ".".join(['_'.join(label.split()), "result", "txt"])
	        result_file = os.path.join(RESULTS_FOLDER, result_file)

	        parts = []
	        if not os.path.exists(result_file):
	            # First section for this label: start the file with its title.
	            parts.append(f"# Code: {label}\n" + 25 * "=" + "\n\n")
	        parts.append(f"## Source: {source}\n" + 25 * "-" + "\n")
	        parts.append("\n".join(f'-{v}' for v in match.values()))
	        parts.append("\n" + 25 * "-" + "\n\n")

	        # Do not depend on the caller having created the folder, and write
	        # UTF-8 explicitly so non-ASCII document text cannot fail under a
	        # restrictive locale encoding.
	        os.makedirs(RESULTS_FOLDER, exist_ok=True)
	        with open(result_file, "a", encoding="utf-8") as f:
	            f.write("".join(parts))

	        files.append(result_file)

	    return files


	def convert(*keywords):
	    """Run the keyword search over every document and upload the results.

	    Each positional argument is the comma-separated keyword string for the
	    category at the same position in ``CATEGORIES``. The results folder is
	    recreated from scratch, every file found one level below ``DOC_FOLDER``
	    is processed with ``main``, and the folder is then uploaded to the
	    ``patrickvonplaten/atlas`` dataset repo under a timestamped path.

	    Args:
	        *keywords: One comma-separated keyword string per category.

	    Returns:
	        A status message stating how many files were visited.
	    """
	    codewords_mapping = {}
	    for category, raw in zip(CATEGORIES, keywords):
	        # Accept "a, b" style input; ignore blank entries and empty boxes so
	        # they cannot turn into a pattern that matches everything.
	        codewords = [word.strip() for word in (raw or "").split(",")]
	        codewords = [word for word in codewords if word]
	        if codewords:
	            codewords_mapping[category] = codewords

	    # Start every run from an empty results folder.
	    shutil.rmtree(RESULTS_FOLDER, ignore_errors=True)
	    os.makedirs(RESULTS_FOLDER)

	    num_files = 0
	    folders = glob.glob(os.path.join(DOC_FOLDER, "*"))

	    for folder in tqdm.tqdm(folders):
	        all_files = glob.glob(f"{folder}/*")
	        num_files += len(all_files)

	        for filename in tqdm.tqdm(all_files):
	            try:
	                main(filename, codewords_mapping)
	            except Exception as e:
	                # Best effort: one unreadable document must not abort the run.
	                print(f"{filename} not working because \n {e}")

	    api.upload_folder(
	        repo_id="patrickvonplaten/atlas",
	        folder_path=RESULTS_FOLDER,
	        path_in_repo=f"results_{time.time()}",
	        repo_type="dataset",
	        token=HF_TOKEN,
	    )

	    return f"Done. Processed {num_files} files."


	# One textbox per category, pre-filled with its default comma-separated keywords.
	inputs = [
	    gr.Textbox(
	        label=f"Enter your keywords for {k}",
	        max_lines=2,
	        # Pass the placeholder as a string, joined the same way as `value`,
	        # instead of handing the component a raw list.
	        placeholder=",".join(CAT_TO_CODEWORDS[k]),
	        value=",".join(CAT_TO_CODEWORDS[k]),
	    )
	    for k in CATEGORIES
	]

	# The textboxes are created in CATEGORIES order, which is the order in which
	# `convert` zips its positional arguments back onto the categories.
	iface = gr.Interface(fn=convert, inputs=inputs, outputs="text")
	iface.launch()