Spaces:

JacobWP
/

language_app_Jacob_WP

Runtime error

App Files Files Community

language_app_Jacob_WP / app.py

JacobWP

Upload 2 files

0b721e9 verified 2 months ago

raw

history blame contribute delete

11.9 kB

	#!/usr/bin/env python3
	# -*- coding: utf-8 -*-
	"""
	Created on Mon May 19 16:49:22 2025

	@author: jacobwildt-persson
	"""

	#!/usr/bin/env python3
	# -*- coding: utf-8 -*-
	# -----------------------------------------------
	# Requirements & Setup Instructions
	# -----------------------------------------------

	# Python version:
	# Requires Python 3.10 or later (tested on 3.12)


	# Run your script inside a virtual environment (e.g. conda or venv) to avoid conflicts.
	# Recreate the environment with these commands in the terminal:
	# conda env create -f environment.yml
	# conda activate sprakenv
	#

	# Install all required packages:
	# Run these commands in the terminal:

	# pip install --upgrade gradio
	# pip install pdfplumber
	# pip install nltk
	# pip install transformers
	# pip install -U spacy

	# Download language models:
	# python -m spacy download es_core_news_lg
	# python -m spacy download en_core_web_lg # if you add NER for English

	# Check Gradio version used:
	# import gradio as gr
	# print(gr.__version__) # Gradio version 4.18.0

	# 🔗 Reference: Gradio Quickstart Guide
	# https://www.gradio.app/guides/quickstart
	#Hugging Face
	# https://huggingface.co/models

	# English grammar-check API
	# LanguageTool API: https://languagetool.org/http-api/swagger



	# Remember!
	# Run your script inside a virtual environment (e.g. conda or venv) to avoid conflicts.
	# Recreate the environment with these commands in the terminal:
	# conda env create -f environment.yml
	# conda activate sprakenv
	# python -m spacy download es_core_news_lg
	#python -m nltk.downloader punkt wordnet
	# -----------------------------------------------
	"""
	Language learning app with a Gradio UI and support for multiple users:
	- Import text from file (.txt/.csv/.pdf) or manual text input
	- Grammar correction via transformers (Spanish) or LanguageTool API (English)
	- Analyze text (known/unknown words) per user & language
	- Save unknown words as known
	- Generate coherent practice sentence (Spanish & English)
	- Log grammar corrections and practice sentence suggestions to CSV
	"""
	import os
	import datetime
	import sqlite3
	import requests
	import random
	import pandas as pd
	import pdfplumber
	import spacy
	import csv
	# SQLite is accessed via the built-in sqlite3 module (no need to install sqlite3-binary)
	import sqlite3

	from nltk.tokenize import word_tokenize
	from nltk.stem import WordNetLemmatizer
	from transformers import AutoTokenizer, BartForConditionalGeneration, AutoModelForCausalLM
	import gradio as gr
	import gradio_client.utils as _gcu

	# --- PATCH for Gradio utils schema bug ---
	# Keep references to the original gradio_client.utils helpers so the patched
	# wrappers defined below can delegate to them.
	_orig_json = _gcu.json_schema_to_python_type
	_orig_get = _gcu.get_type

	def _patched_json_to_py(schema, defs=None):
	if not isinstance(schema, dict):
	return "any"
	try:
	return _orig_json(schema, defs)
	except Exception:
	return "any"

	def _patched_get_type(schema):
	if not isinstance(schema, dict):
	return "any"
	try:
	return _orig_get(schema)
	except Exception:
	return "any"

	# Install the tolerant wrappers in place of the originals on gradio_client.utils.
	_gcu.json_schema_to_python_type = _patched_json_to_py
	_gcu.get_type = _patched_get_type

	# --- SQLite Database initialization ---
	# Path of the SQLite file that stores the known-word list of every user.
	DB_NAME = "vocabulary.db"

	# Create the vocabulary table if it does not exist yet. The UNIQUE constraint
	# keeps a single row per (user_id, language, word).
	conn = sqlite3.connect(DB_NAME)
	try:
	    conn.execute("""
	        CREATE TABLE IF NOT EXISTS vocabulary (
	            user_id TEXT,
	            language TEXT,
	            word TEXT,
	            timestamp TEXT,
	            UNIQUE(user_id, language, word)
	        )
	    """)
	    conn.commit()
	finally:
	    # Close even when table creation fails so the handle is never leaked.
	    conn.close()

	# --- Save word to database ---
	def save_word_to_db(user_id: str, language: str, word: str):
	    """Record ``word`` as known for ``user_id`` in ``language``.

	    The row is stamped with the current local time in ISO format. Because the
	    statement is ``INSERT OR IGNORE``, saving a word that is already stored
	    for the same user and language is a no-op.

	    Args:
	        user_id: Identifier of the user who owns the word.
	        language: Language code the word belongs to.
	        word: The word to store.
	    """
	    ts = datetime.datetime.now().isoformat()
	    conn = sqlite3.connect(DB_NAME)
	    try:
	        conn.execute(
	            "INSERT OR IGNORE INTO vocabulary (user_id, language, word, timestamp) VALUES (?, ?, ?, ?)",
	            (user_id, language, word, ts),
	        )
	        conn.commit()
	    finally:
	        # Release the connection even if the insert or commit raises.
	        conn.close()

	# --- Retrieve known words for user/language ---
	def get_user_vocabulary(user_id: str, language: str) -> set[str]:
	    """Return the set of words stored for ``user_id`` in ``language``.

	    Args:
	        user_id: Identifier of the user whose words are looked up.
	        language: Language code to filter on.

	    Returns:
	        The stored words as a set; empty when nothing matches.
	    """
	    conn = sqlite3.connect(DB_NAME)
	    try:
	        rows = conn.execute(
	            "SELECT word FROM vocabulary WHERE user_id=? AND language=?",
	            (user_id, language),
	        ).fetchall()
	    finally:
	        # Release the connection even if the query raises.
	        conn.close()
	    return {r[0] for r in rows}

	# --- Load NLP models ---
	# All models are loaded once at import time and shared by the functions below.
	# Spanish spaCy pipeline; correct_grammar uses it to split text into sentences.
	nlp = spacy.load("es_core_news_lg")
	# Tokenizer/model pair used by correct_grammar for the Spanish ("es") branch.
	tokenizer = AutoTokenizer.from_pretrained("SkitCon/gec-spanish-BARTO-COWS-L2H")
	model = BartForConditionalGeneration.from_pretrained("SkitCon/gec-spanish-BARTO-COWS-L2H")
	# Causal language models used by generate_coherent_sentence:
	# the "_es" pair when the language is "es", the "_en" pair otherwise.
	gpt2_tokenizer_es = AutoTokenizer.from_pretrained("mrm8488/spanish-gpt2")
	gpt2_model_es = AutoModelForCausalLM.from_pretrained("mrm8488/spanish-gpt2")
	gpt2_tokenizer_en = AutoTokenizer.from_pretrained("gpt2")
	gpt2_model_en = AutoModelForCausalLM.from_pretrained("gpt2")
	# Lemmatizer used by analyze_text for every language.
	# NOTE(review): WordNet is an English resource - confirm this is intended for Spanish input.
	lemmatizer = WordNetLemmatizer()

	# ---Log to CSV (grammar corrections and sentence suggestions) ---
	def log_to_csv(filename, row, fieldnames):
	    """Append ``row`` to the CSV file ``filename``, writing a header when needed.

	    Args:
	        filename: Path of the CSV log file; it is created if missing.
	        row: Mapping from field name to value for the row to append.
	        fieldnames: Column names, in output order.
	    """
	    # A header is needed for a new file and also for an existing but empty
	    # one, which a plain existence check would miss.
	    needs_header = not os.path.isfile(filename) or os.path.getsize(filename) == 0
	    with open(filename, "a", newline="", encoding="utf-8") as csvfile:
	        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
	        if needs_header:
	            writer.writeheader()
	        writer.writerow(row)

	# --- File Import ---
	def import_file(path: str) -> str:
	    """Return the text content of a ``.pdf``, ``.csv`` or ``.txt`` file.

	    The format is chosen from the (case-insensitive) file extension:

	    * ``.pdf``: the extracted text of every page, joined with newlines;
	      pages without extractable text contribute an empty string.
	    * ``.csv``: the values of the ``text`` column, joined with newlines.
	    * ``.txt``: the whole file, decoded as UTF-8.

	    Args:
	        path: Path of the file to read.

	    Returns:
	        The extracted text.

	    Raises:
	        ValueError: If the extension is not supported, or a CSV file has no
	            ``text`` column.
	    """
	    ext = os.path.splitext(path)[1].lower()
	    if ext == ".pdf":
	        with pdfplumber.open(path) as pdf:
	            return "\n".join(p.extract_text() or "" for p in pdf.pages)
	    if ext == ".csv":
	        df = pd.read_csv(path)
	        if "text" in df:
	            return "\n".join(df["text"].astype(str))
	        raise ValueError("CSV saknar kolumnen 'text'.")
	    if ext == ".txt":
	        # Use a context manager so the file handle is closed deterministically.
	        with open(path, encoding="utf-8") as fh:
	            return fh.read()
	    raise ValueError(f"Okänt filformat: {ext}")

	# --- Grammar Correction ---

	def _apply_languagetool_matches(text: str, matches: list) -> str:
	    """Apply the first suggested replacement of each LanguageTool match to ``text``."""
	    # Work from the end of the text backwards so earlier offsets stay valid.
	    ordered = sorted(matches, key=lambda m: m["offset"], reverse=True)
	    leftmost = len(text)
	    for m in ordered:
	        start = m["offset"]
	        end = start + m["length"]
	        repls = m.get("replacements") or []
	        # No suggestion: keep the original wording instead of deleting it.
	        # Overlap with an already rewritten span: skip to avoid corrupting it.
	        if not repls or end > leftmost:
	            continue
	        text = text[:start] + repls[0]["value"] + text[end:]
	        leftmost = start
	    return text


	def correct_grammar(text: str, language: str) -> str:
	    """Return ``text`` with grammar corrections applied.

	    For ``language == "es"`` the text is split into sentences with spaCy and
	    each non-empty sentence is rewritten by the local seq2seq model; the
	    corrected sentences are joined with single spaces. For any other
	    language the public LanguageTool HTTP API is queried and the first
	    suggested replacement of every match is applied.

	    Args:
	        text: The text to correct.
	        language: ``"es"`` for the local model; any other value is passed
	            to LanguageTool as its ``language`` parameter.

	    Returns:
	        The corrected text.

	    Raises:
	        requests.RequestException: If the LanguageTool request fails, times
	            out, or returns an HTTP error status.
	    """
	    if language == "es":
	        corrected = []
	        for sent in nlp(text).sents:
	            s = sent.text.strip()
	            if not s:
	                continue
	            inp = tokenizer(s, return_tensors="pt", truncation=True, padding=True)
	            out = model.generate(
	                **inp,
	                # Allow at most as many new tokens as the input sentence has.
	                max_new_tokens=inp.input_ids.shape[1],
	                num_beams=5,
	                early_stopping=True,
	            )
	            corrected.append(tokenizer.decode(out[0], skip_special_tokens=True))
	        return " ".join(corrected)

	    # Nothing to check: skip the network round trip.
	    if not text.strip():
	        return text
	    resp = requests.post(
	        "https://api.languagetool.org/v2/check",
	        data={"text": text, "language": language},
	        timeout=30,  # never hang the UI on an unresponsive service
	    )
	    resp.raise_for_status()
	    return _apply_languagetool_matches(text, resp.json().get("matches", []))

	# --- Analyze known and unknown words ---

	def analyze_text(text: str, user_id: str, language: str):
	    """Split the words of ``text`` into those the user knows and those they do not.

	    The text is tokenized, purely alphabetic tokens are lower-cased and
	    lemmatized, and each distinct lemma is looked up in the vocabulary
	    stored for ``user_id`` and ``language``.

	    Args:
	        text: The text to analyze.
	        user_id: Identifier of the user whose vocabulary is consulted.
	        language: Language code used for the vocabulary lookup.

	    Returns:
	        A ``(known, unknown)`` pair of lists of lemmas, each free of
	        duplicates and ordered by first appearance in ``text``.
	    """
	    # NOTE(review): word_tokenize runs with its default language and the
	    # lemmatizer is WordNet-based regardless of `language` - confirm this
	    # is acceptable for Spanish input.
	    toks = word_tokenize(text)
	    # dict.fromkeys drops repeated lemmas while keeping first-seen order, so
	    # a word used several times is reported (and later saved/sampled) once.
	    lems = list(dict.fromkeys(
	        lemmatizer.lemmatize(w.lower()) for w in toks if w.isalpha()
	    ))
	    vocab = get_user_vocabulary(user_id, language)
	    known = [w for w in lems if w in vocab]
	    unknown = [w for w in lems if w not in vocab]
	    return known, unknown
	# --- Generate sentence using GPT2 based on unknown words ---
	def generate_coherent_sentence(text: str, user_id: str, language: str, num_unknown=2) -> str:
	    """Generate one practice sentence built around some of the user's unknown words.

	    Up to ``num_unknown`` distinct unknown words from ``text`` are picked at
	    random and placed in a prompt for the Spanish model (``language ==
	    "es"``) or the English model (any other language). The first sentence of
	    the sampled continuation is returned.

	    Args:
	        text: Text whose unknown words are used.
	        user_id: Identifier of the user whose vocabulary is consulted.
	        language: ``"es"`` selects the Spanish prompt and model.
	        num_unknown: How many unknown words to include; values below 1 are
	            treated as 1 and non-integers are truncated.

	    Returns:
	        The generated sentence, or a Swedish status message when there are no
	        unknown words or the model output contains no letters.
	    """
	    _, unknown = analyze_text(text, user_id, language)
	    # De-duplicate (order-preserving) so the same word is never sampled twice.
	    unknown = list(dict.fromkeys(unknown))
	    if not unknown:
	        return "Inga okända ord att generera mening med."

	    # The count may arrive as a float (e.g. from a UI slider); random.sample
	    # requires a non-negative int.
	    wanted = max(1, int(num_unknown))
	    chosen = random.sample(unknown, min(wanted, len(unknown)))

	    # Distinct local names avoid shadowing the module-level tokenizer/model.
	    if language == "es":
	        prompt = "Escribe una sola frase clara que incluya estas palabras: " + ", ".join(chosen) + "."
	        gen_tokenizer, gen_model = gpt2_tokenizer_es, gpt2_model_es
	    else:
	        prompt = "Write one clear sentence that includes the following words: " + ", ".join(chosen) + "."
	        gen_tokenizer, gen_model = gpt2_tokenizer_en, gpt2_model_en

	    inp = gen_tokenizer(prompt, return_tensors="pt", truncation=True)
	    outs = gen_model.generate(
	        **inp,
	        max_new_tokens=50,
	        do_sample=True,
	        top_k=50,
	        top_p=0.95,
	    )
	    gen = gen_tokenizer.decode(outs[0], skip_special_tokens=True)
	    # The decoded output normally echoes the prompt; keep only the continuation.
	    body = gen[len(prompt):].strip() if gen.startswith(prompt) else gen.strip()
	    # Keep just the first sentence of the continuation.
	    sentence = (body.split(".")[0].strip() + ".") if "." in body else body
	    if not any(c.isalpha() for c in sentence):
	        return "Misslyckades att generera meningsfull övningsmening."
	    return sentence


	# --- Gradio process callback ---
	def process(user, language, txt, file, do_grammar, do_save):
	    """Gradio callback: optionally correct the input text and classify its words.

	    Manually entered text takes precedence over an uploaded file. Every
	    successful run is appended to ``grammarlog.csv``.

	    Args:
	        user: User name used as the vocabulary owner.
	        language: Language code of the text.
	        txt: Manually entered text (may be empty or ``None``).
	        file: Uploaded file, either a path string or an object whose
	            ``name`` attribute is the path (may be ``None``).
	        do_grammar: Whether to run grammar correction first.
	        do_save: Whether to store the unknown words as known.

	    Returns:
	        A 5-tuple ``(text, known words, unknown words, status, "")`` matching
	        the output components; the last element clears the sentence box.
	        On any error the status carries the traceback and the other fields
	        are empty.
	    """
	    import traceback

	    try:
	        if txt and txt.strip():
	            text = txt.strip()
	        elif file:
	            # Accept a plain path string as well as an upload object with .name.
	            text = import_file(getattr(file, "name", file))
	        else:
	            return "", "", "", "Ingen text angiven.", ""
	        out = correct_grammar(text, language) if do_grammar else text
	        kn, un = analyze_text(out, user, language)
	        status = ""
	        if do_save and un:
	            # Save each distinct word once so the reported count is accurate.
	            new_words = list(dict.fromkeys(un))
	            for w in new_words:
	                save_word_to_db(user, language, w)
	            status = f"Sparade {len(new_words)} ord."
	        # Log the grammar correction to CSV.
	        # NOTE(review): the original nesting of this call was lost in this copy
	        # of the file; it is assumed to run on every successful call.
	        log_to_csv(
	            "grammarlog.csv",
	            {
	                "user": user, "language": language, "input": text,
	                "output": out, "timestamp": datetime.datetime.now().isoformat(),
	            },
	            ["user", "language", "input", "output", "timestamp"],
	        )
	        return out, ", ".join(kn), ", ".join(un), status, ""
	    except Exception:
	        # UI boundary: show the failure in the status box instead of crashing.
	        tb = traceback.format_exc()
	        return "", "", "", f"FEL i process:\n{tb}", ""

	# --- Sentence generation callback ---
	def coherent_fn(user, language, txt, num):
	    """Gradio callback: produce a practice sentence and log it to ``sentencelog.csv``.

	    Returns the generated sentence, or a Swedish error message containing
	    the exception text if generation or logging fails.
	    """
	    columns = ["user", "language", "input", "output", "timestamp"]
	    try:
	        suggestion = generate_coherent_sentence(txt if txt else "", user, language, num)
	        # Log the practice suggestion to CSV.
	        values = (user, language, txt, suggestion, datetime.datetime.now().isoformat())
	        log_to_csv("sentencelog.csv", dict(zip(columns, values)), columns)
	    except Exception as err:
	        return f"Fel vid generering: {err}"
	    return suggestion

	# --- Gradio UI ---
	# Build the Gradio interface: input widgets, two action buttons and five
	# output boxes, then wire the buttons to their callbacks.
	# NOTE(review): the nesting of the `with` blocks was lost in this copy of the
	# file, so which widgets sit inside the Row/Column cannot be read from here.
	demo = gr.Blocks()
	with demo:
	gr.Markdown("### 🌟 Språkinlärningsapp med användare & flerspråkighet")
	with gr.Row():
	# Identity and language selection; both are passed to every callback.
	user_input = gr.Textbox(label="Användarnamn", placeholder="Ditt namn här")
	lang_dd = gr.Dropdown(choices=["es", "en"], value="es", label="Språk")
	with gr.Column():
	# Text can be typed/pasted or uploaded as .txt/.csv/.pdf.
	manual_input = gr.Textbox(lines=4, label="Skriv/klistra in text")
	file_input = gr.File(file_types=[".txt",".csv",".pdf"], label="Importera fil")
	# Options for the analysis run.
	grammar_cb = gr.Checkbox(label="Grammatikrättning")
	autosave_cb = gr.Checkbox(label="Spara okända ord")
	run_btn = gr.Button("Kör analys & korrigering")
	# Number of unknown words to include in the practice sentence.
	num_slider = gr.Slider(minimum=1, maximum=5, step=1, value=2, label="Antal okända ord för övning")
	coherent_btn = gr.Button("Koherent övningsmening")

	# Output boxes filled by the callbacks.
	corr_out = gr.Textbox(label="Korrigerad text", lines=4)
	known_out = gr.Textbox(label="Kända ord")
	unknown_out = gr.Textbox(label="Okända ord")
	status_out = gr.Textbox(label="Status")
	coherent_out = gr.Textbox(label="Koherent övningsmening")

	# --- Button click wiring ---
	# process returns five values, one per output component listed here.
	run_btn.click(
	fn=process,
	inputs=[user_input, lang_dd, manual_input, file_input, grammar_cb, autosave_cb],
	outputs=[corr_out, known_out, unknown_out, status_out, coherent_out]
	)
	# coherent_fn returns a single string shown in the practice-sentence box.
	coherent_btn.click(
	fn=coherent_fn,
	inputs=[user_input, lang_dd, manual_input, num_slider],
	outputs=[coherent_out]
	)
	# Remember to set the language dropdown to the language of the text or file being analyzed.

	# --- Start app ---
	if __name__ == "__main__":
	    # launch() returns (app, local_url, share_url); with prevent_thread_lock=True
	    # it returns immediately, which lets the URL be printed below.
	    _app, local_url, share_url = demo.launch(
	        share=True, inbrowser=True, prevent_thread_lock=True
	    )
	    # Prefer the public share link; it is None when no share link was created.
	    print("Appen körs på:", share_url or local_url)
	    # Keep the process alive: without this the script would finish and the
	    # non-blocking server would shut down right after starting.
	    demo.block_thread()