# NOTE: hosting-page status banner captured together with this file: "Spaces: Runtime error"
#!/usr/bin/env python3 | |
# -*- coding: utf-8 -*-
"""
Created on Mon May 19 16:49:22 2025
@author: jacobwildt-persson
"""
#!/usr/bin/env python3 | |
# -*- coding: utf-8 -*- | |
# ----------------------------------------------- | |
# Requirements & Setup Instructions | |
# ----------------------------------------------- | |
# Python version: | |
# Requires Python 3.10 or later (tested on 3.12) | |
# Run your script inside a virtual environment (e.g. conda or venv) to avoid conflicts. | |
# Recreate the environment with these commands in the terminal:
# conda env create -f environment.yml | |
# conda activate sprakenv | |
# | |
# Install all required packages: | |
# Run these commands in the terminal: | |
# pip install --upgrade gradio | |
# pip install pdfplumber | |
# pip install nltk | |
# pip install transformers | |
# pip install -U spacy | |
# Download language models: | |
# python -m spacy download es_core_news_lg | |
# python -m spacy download en_core_web_lg # if you add NER for English | |
# Check Gradio version used: | |
# import gradio as gr | |
# print(gr.__version__) # Gradio version 4.18.0 | |
# 🔗 Reference: Gradio Quickstart Guide | |
# https://www.gradio.app/guides/quickstart | |
#Hugging Face | |
# https://huggingface.co/models | |
# English API model
# LanguageTool API: https://languagetool.org/http-api/swagger | |
# Remember!
# Run your script inside a virtual environment (e.g. conda or venv) to avoid conflicts. | |
# Recreate the environment with these commands in the terminal:
# conda env create -f environment.yml | |
# conda activate sprakenv | |
# python -m spacy download es_core_news_lg | |
#python -m nltk.downloader punkt wordnet | |
# -----------------------------------------------
"""
Language learning app with Gradio UI, for one or multiple users:
- Import text from file (.txt/.csv/.pdf) or manual text input
- Grammar correction via transformers (Spanish) or LanguageTool API (English)
- Analyze text (known/unknown words) per user & language
- Save unknown words as known
- Generate coherent practice sentence (Spanish & English)
- Log grammar corrections and practice sentence suggestions to CSV
"""
import csv
import datetime
import os
import random
# SQLite is accessed via the built-in sqlite3 module (no need to install sqlite3-binary)
import sqlite3
import traceback
from contextlib import closing

import gradio as gr
import gradio_client.utils as _gcu
import pandas as pd
import pdfplumber
import requests
import spacy
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from transformers import AutoTokenizer, BartForConditionalGeneration, AutoModelForCausalLM
# --- PATCH for Gradio utils schema bug ---
# Keep references to the library's own implementations so the wrappers below
# can delegate to them.
_orig_json = _gcu.json_schema_to_python_type
_orig_get = _gcu.get_type


def _patched_json_to_py(schema, defs=None):
    """Convert a JSON schema to a type string, falling back to ``"any"``.

    Delegates to the original ``json_schema_to_python_type``; a schema that is
    not a dict, or any exception raised by the original, yields ``"any"``.
    """
    if isinstance(schema, dict):
        try:
            return _orig_json(schema, defs)
        except Exception:
            # Deliberate best-effort: an unparseable schema is reported as "any".
            pass
    return "any"


def _patched_get_type(schema):
    """Return the schema's type via the original ``get_type``, or ``"any"``.

    A schema that is not a dict, or any exception raised by the original,
    yields ``"any"``.
    """
    if isinstance(schema, dict):
        try:
            return _orig_get(schema)
        except Exception:
            # Deliberate best-effort, same as above.
            pass
    return "any"


# Install the tolerant wrappers in place of the library functions.
_gcu.json_schema_to_python_type = _patched_json_to_py
_gcu.get_type = _patched_get_type
# --- SQLite Database initialization ---
# Path of the SQLite file that stores every user's known words.
DB_NAME = "vocabulary.db"

# Create the table on first run. ``closing`` guarantees the connection is
# closed even if the statement fails; the inner ``with conn`` commits on
# success and rolls back on error.
with closing(sqlite3.connect(DB_NAME)) as conn:
    with conn:
        conn.execute("""
            CREATE TABLE IF NOT EXISTS vocabulary (
                user_id TEXT,
                language TEXT,
                word TEXT,
                timestamp TEXT,
                UNIQUE(user_id, language, word)
            )
        """)
# --- Save word to database ---
def save_word_to_db(user_id: str, language: str, word: str) -> None:
    """Store *word* as known for the given user and language.

    The row is written with the current local time as an ISO-8601 timestamp.
    ``INSERT OR IGNORE`` makes the call a no-op when the
    (user_id, language, word) combination is already stored.

    Args:
        user_id: Identifier of the user the word belongs to.
        language: Language code the word belongs to (e.g. ``"es"``).
        word: The word to store.
    """
    ts = datetime.datetime.now().isoformat()
    # ``closing`` releases the connection even when the insert raises;
    # ``with conn`` commits on success and rolls back on error.
    with closing(sqlite3.connect(DB_NAME)) as conn:
        with conn:
            conn.execute(
                "INSERT OR IGNORE INTO vocabulary (user_id, language, word, timestamp) VALUES (?, ?, ?, ?)",
                (user_id, language, word, ts),
            )
# --- Retrieve known words for user/language ---
def get_user_vocabulary(user_id: str, language: str) -> set[str]:
    """Return the set of words stored as known for a user and language.

    Args:
        user_id: Identifier of the user whose vocabulary is read.
        language: Language code to filter on (e.g. ``"es"``).

    Returns:
        The stored words; an empty set when nothing matches.
    """
    # ``closing`` releases the connection even when the query raises.
    with closing(sqlite3.connect(DB_NAME)) as conn:
        rows = conn.execute(
            "SELECT word FROM vocabulary WHERE user_id=? AND language=?",
            (user_id, language),
        ).fetchall()
    return {r[0] for r in rows}
# --- Load NLP models ---
# All models are loaded once, when the module is imported, and shared by the
# functions below through these module-level names.
_GEC_ES_CHECKPOINT = "SkitCon/gec-spanish-BARTO-COWS-L2H"
_GPT2_ES_CHECKPOINT = "mrm8488/spanish-gpt2"
_GPT2_EN_CHECKPOINT = "gpt2"

# spaCy pipeline; used for sentence splitting in correct_grammar().
nlp = spacy.load("es_core_news_lg")

# Seq2seq model used for Spanish grammar correction.
tokenizer = AutoTokenizer.from_pretrained(_GEC_ES_CHECKPOINT)
model = BartForConditionalGeneration.from_pretrained(_GEC_ES_CHECKPOINT)

# Causal language models used for practice-sentence generation.
gpt2_tokenizer_es = AutoTokenizer.from_pretrained(_GPT2_ES_CHECKPOINT)
gpt2_model_es = AutoModelForCausalLM.from_pretrained(_GPT2_ES_CHECKPOINT)
gpt2_tokenizer_en = AutoTokenizer.from_pretrained(_GPT2_EN_CHECKPOINT)
gpt2_model_en = AutoModelForCausalLM.from_pretrained(_GPT2_EN_CHECKPOINT)

# Lemmatizer used by analyze_text().
lemmatizer = WordNetLemmatizer()
# --- Log to CSV (grammar corrections and sentence suggestions) ---
def log_to_csv(filename, row, fieldnames):
    """Append *row* to the CSV file *filename*, writing a header when needed.

    The header is written when the file does not exist yet or exists but is
    empty, so a pre-created empty log file still gets its column names.

    Args:
        filename: Path of the CSV log file; created if missing.
        row: Mapping from field name to value for the row to append.
        fieldnames: Column names, in the order they are written.
    """
    needs_header = not os.path.isfile(filename) or os.path.getsize(filename) == 0
    # newline="" lets the csv module control line endings itself.
    with open(filename, "a", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        if needs_header:
            writer.writeheader()
        writer.writerow(row)
# --- File Import ---
def import_file(path: str) -> str:
    """Read the text content of a .pdf, .csv or .txt file.

    The format is chosen from the file extension (case-insensitive):

    * ``.pdf`` - the extracted text of every page, joined by newlines; pages
      without extractable text contribute an empty string.
    * ``.csv`` - the values of the ``text`` column, joined by newlines.
    * ``.txt`` - the whole file, decoded as UTF-8.

    Args:
        path: Path of the file to read.

    Returns:
        The file's text content.

    Raises:
        ValueError: If a CSV file has no ``text`` column or the extension is
            not one of the supported formats.
    """
    ext = os.path.splitext(path)[1].lower()

    if ext == ".pdf":
        with pdfplumber.open(path) as pdf:
            pages = [p.extract_text() or "" for p in pdf.pages]
        return "\n".join(pages)

    if ext == ".csv":
        df = pd.read_csv(path)
        if "text" not in df.columns:
            raise ValueError("CSV saknar kolumnen 'text'.")
        return "\n".join(df["text"].astype(str))

    if ext == ".txt":
        # Context manager so the handle is closed deterministically.
        with open(path, encoding="utf-8") as fh:
            return fh.read()

    raise ValueError(f"Okänt filformat: {ext}")
# --- Grammar Correction ---
def _apply_languagetool_matches(text: str, matches: list) -> str:
    """Apply the first suggested replacement of every match to *text*."""
    # Walk the matches back to front so that replacing a later span does not
    # shift the offsets of the earlier ones.
    # NOTE(review): assumes the API lists matches in ascending offset order.
    for m in reversed(matches):
        repls = m.get("replacements") or []
        if not repls:
            # No suggestion available: keep the flagged text instead of
            # replacing it with an empty string.
            continue
        off, ln = m["offset"], m["length"]
        text = text[:off] + repls[0]["value"] + text[off + ln:]
    return text


def correct_grammar(text: str, language: str) -> str:
    """Return a grammar-corrected version of *text*.

    For ``language == "es"`` the text is split into sentences with the spaCy
    pipeline and each non-empty sentence is rewritten by the module-level
    seq2seq correction model; the corrected sentences are joined with single
    spaces. For any other language code the text is sent to the public
    LanguageTool HTTP API and the first suggested replacement of each reported
    match is applied.

    Args:
        text: The text to correct.
        language: ``"es"`` for the local model, otherwise the language code
            passed on to LanguageTool.

    Returns:
        The corrected text.

    Raises:
        requests.RequestException: If the LanguageTool request times out or
            returns an HTTP error status.
    """
    if language == "es":
        corrected = []
        for sent in nlp(text).sents:
            s = sent.text.strip()
            if not s:
                continue
            inp = tokenizer(s, return_tensors="pt", truncation=True, padding=True)
            out = model.generate(
                **inp,
                # Allow at most as many new tokens as the input has.
                max_new_tokens=inp.input_ids.shape[1],
                num_beams=5,
                early_stopping=True,
            )
            corrected.append(tokenizer.decode(out[0], skip_special_tokens=True))
        return " ".join(corrected)

    # English (and any other language): LanguageTool API
    resp = requests.post(
        "https://api.languagetool.org/v2/check",
        data={"text": text, "language": language},
        timeout=30,  # never hang the UI callback on an unresponsive server
    )
    resp.raise_for_status()
    return _apply_languagetool_matches(text, resp.json().get("matches", []))
# --- Analyze known and unknown words ---
def analyze_text(text: str, user_id: str, language: str) -> tuple[list[str], list[str]]:
    """Split the words of *text* into known and unknown for a user.

    The text is tokenized, purely alphabetic tokens are lower-cased and
    lemmatized, and each distinct lemma is classified by whether it is in the
    user's stored vocabulary for *language*. Every lemma is reported once, in
    order of first appearance, so that repeated words do not inflate the
    result.

    NOTE(review): the same tokenizer defaults and lemmatizer are applied
    regardless of *language* - confirm this is intended for Spanish input.

    Args:
        text: The text to analyze.
        user_id: Identifier of the user whose vocabulary is consulted.
        language: Language code used to look up the vocabulary.

    Returns:
        A ``(known, unknown)`` pair of lemma lists.
    """
    toks = word_tokenize(text)
    # dict.fromkeys de-duplicates while preserving first-seen order.
    lems = dict.fromkeys(lemmatizer.lemmatize(w.lower()) for w in toks if w.isalpha())
    vocab = get_user_vocabulary(user_id, language)
    known = [w for w in lems if w in vocab]
    unknown = [w for w in lems if w not in vocab]
    return known, unknown
# --- Generate sentence using GPT2 based on unknown words ---
def generate_coherent_sentence(text: str, user_id: str, language: str, num_unknown=2) -> str:
    """Generate one practice sentence built around unknown words from *text*.

    Up to *num_unknown* distinct unknown words are picked at random and placed
    in a prompt for the Spanish (``language == "es"``) or English GPT-2 model.
    The generated continuation is cut at its first full stop.

    Args:
        text: Source text whose unknown words are used.
        user_id: Identifier of the user whose vocabulary is consulted.
        language: ``"es"`` selects the Spanish model; anything else the
            English one.
        num_unknown: How many unknown words to include. Accepts ints or
            whole-number floats (as UI sliders may deliver); values below 1
            are treated as 1.

    Returns:
        The generated sentence, or a fixed message when there are no unknown
        words or the model output contains no letters.
    """
    _known, unknown = analyze_text(text, user_id, language)
    if not unknown:
        return "Inga okända ord att generera mening med."

    # De-duplicate so the same word cannot be sampled twice.
    candidates = list(dict.fromkeys(unknown))
    # random.sample needs a non-negative int; coerce and clamp the request.
    wanted = max(1, int(num_unknown))
    chosen = random.sample(candidates, min(wanted, len(candidates)))

    # Local names are deliberately distinct from the module-level
    # ``tokenizer``/``model`` used for grammar correction.
    if language == "es":
        prompt = "Escribe una sola frase clara que incluya estas palabras: " + ", ".join(chosen) + "."
        gen_tokenizer = gpt2_tokenizer_es
        gen_model = gpt2_model_es
    else:
        prompt = "Write one clear sentence that includes the following words: " + ", ".join(chosen) + "."
        gen_tokenizer = gpt2_tokenizer_en
        gen_model = gpt2_model_en

    inp = gen_tokenizer(prompt, return_tensors="pt", truncation=True)
    outs = gen_model.generate(
        **inp,
        max_new_tokens=50,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        # Set the padding token explicitly rather than relying on
        # generate()'s implicit fallback.
        pad_token_id=gen_tokenizer.eos_token_id,
    )
    gen = gen_tokenizer.decode(outs[0], skip_special_tokens=True)

    # Drop the echoed prompt, then keep only the first sentence.
    body = gen[len(prompt):].strip() if gen.startswith(prompt) else gen.strip()
    sentence = (body.split(".")[0].strip() + ".") if "." in body else body
    if not any(c.isalpha() for c in sentence):
        return "Misslyckades att generera meningsfull övningsmening."
    return sentence
# --- Gradio process callback ---
def process(user, language, txt, file, do_grammar, do_save):
    """Run the analysis pipeline for the "run" button.

    Manual text takes precedence over an uploaded file. The text is optionally
    grammar-corrected, split into known and unknown words for *user*, and the
    unknown words are optionally saved as known. Every run is appended to
    ``grammarlog.csv``.

    Args:
        user: User name the vocabulary belongs to.
        language: Language code of the text.
        txt: Manually entered text; may be empty or None.
        file: Uploaded file, either a path string or an object whose ``name``
            attribute holds the path; may be None.
        do_grammar: Whether to run grammar correction.
        do_save: Whether to store the unknown words as known.

    Returns:
        A 5-tuple ``(corrected_text, known_words, unknown_words, status,
        practice_sentence)``. The last element is always an empty string.
        On any error the status carries the traceback and the other fields
        are empty.
    """
    try:
        if txt and txt.strip():
            text = txt.strip()
        elif file:
            # Accept both a plain path and a file-like object exposing .name.
            text = import_file(getattr(file, "name", file))
        else:
            return "", "", "", "Ingen text angiven.", ""

        out = correct_grammar(text, language) if do_grammar else text
        kn, un = analyze_text(out, user, language)

        status = ""
        if do_save and un:
            # Save and count each distinct word once.
            to_save = list(dict.fromkeys(un))
            for w in to_save:
                save_word_to_db(user, language, w)
            status = f"Sparade {len(to_save)} ord."

        # Log the input and the (possibly corrected) output to CSV.
        log_to_csv(
            "grammarlog.csv",
            {
                "user": user, "language": language, "input": text,
                "output": out, "timestamp": datetime.datetime.now().isoformat()
            },
            ["user", "language", "input", "output", "timestamp"]
        )
        return out, ", ".join(kn), ", ".join(un), status, ""
    except Exception:
        # Top-level UI boundary: show the failure in the status box instead
        # of crashing the callback.
        return "", "", "", f"FEL i process:\n{traceback.format_exc()}", ""
# --- Sentence generation callback ---
def coherent_fn(user, language, txt, num):
    """Generate a practice sentence for the "coherent sentence" button.

    The suggestion is appended to ``sentencelog.csv`` before it is returned.

    Args:
        user: User name whose vocabulary is consulted.
        language: Language code of the text.
        txt: Text to take unknown words from; None is treated as empty.
        num: Number of unknown words to include; coerced to ``int`` because
            a UI slider may deliver a float.

    Returns:
        The generated sentence, or an error message if anything fails.
    """
    try:
        suggestion = generate_coherent_sentence(txt or "", user, language, int(num))
        # Log the practice suggestion to CSV.
        log_to_csv(
            "sentencelog.csv",
            {
                "user": user, "language": language, "input": txt,
                "output": suggestion, "timestamp": datetime.datetime.now().isoformat()
            },
            ["user", "language", "input", "output", "timestamp"]
        )
        return suggestion
    except Exception as e:
        # Top-level UI boundary: report the failure in the output box.
        return f"Fel vid generering: {e}"
# --- Gradio UI ---
# NOTE(review): the original indentation of this section was lost, so the
# Row/Column nesting below is a reconstruction - confirm the intended layout.
with gr.Blocks() as demo:
    gr.Markdown("### 🌟 Språkinlärningsapp med användare & flerspråkighet")

    # Who is practising, and in which language.
    with gr.Row():
        user_input = gr.Textbox(label="Användarnamn", placeholder="Ditt namn här")
        lang_dd = gr.Dropdown(choices=["es", "en"], value="es", label="Språk")

    # Text source: typed/pasted text or an uploaded file.
    with gr.Column():
        manual_input = gr.Textbox(lines=4, label="Skriv/klistra in text")
        file_input = gr.File(file_types=[".txt", ".csv", ".pdf"], label="Importera fil")

    # Options and actions.
    grammar_cb = gr.Checkbox(label="Grammatikrättning")
    autosave_cb = gr.Checkbox(label="Spara okända ord")
    run_btn = gr.Button("Kör analys & korrigering")
    num_slider = gr.Slider(minimum=1, maximum=5, step=1, value=2, label="Antal okända ord för övning")
    coherent_btn = gr.Button("Koherent övningsmening")

    # Result fields.
    corr_out = gr.Textbox(label="Korrigerad text", lines=4)
    known_out = gr.Textbox(label="Kända ord")
    unknown_out = gr.Textbox(label="Okända ord")
    status_out = gr.Textbox(label="Status")
    coherent_out = gr.Textbox(label="Koherent övningsmening")

    # --- Button click wiring ---
    run_btn.click(
        fn=process,
        inputs=[user_input, lang_dd, manual_input, file_input, grammar_cb, autosave_cb],
        outputs=[corr_out, known_out, unknown_out, status_out, coherent_out],
    )
    coherent_btn.click(
        fn=coherent_fn,
        inputs=[user_input, lang_dd, manual_input, num_slider],
        outputs=[coherent_out],
    )
# Make sure the selected language matches the language of the text being analyzed.
# --- Start app ---
if __name__ == "__main__":
    # With prevent_thread_lock=True, launch() returns immediately with
    # (app, local_url, share_url) instead of blocking.
    _app, local_url, share_url = demo.launch(share=True, inbrowser=True, prevent_thread_lock=True)
    # Prefer the public share link when one was created.
    print("Appen körs på:", share_url or local_url)
    # Keep the main thread alive; otherwise the script would end here and
    # take the server down with it.
    demo.block_thread()