#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Created on Mon May 19 16:49:22 2025
# @author: jacobwildt-persson
#
# -----------------------------------------------
# Requirements & Setup Instructions
# -----------------------------------------------
# Python version:
#   Requires Python 3.10 or later (tested on 3.12).
#
# Run this script inside a virtual environment (e.g. conda or venv) to avoid conflicts.
# Recreate the environment with these commands in a terminal:
#   conda env create -f environment.yml
#   conda activate sprakenv
#
# Install all required packages:
#   pip install --upgrade gradio
#   pip install pdfplumber
#   pip install nltk
#   pip install transformers
#   pip install -U spacy
#
# Download language models and NLTK data:
#   python -m spacy download es_core_news_lg
#   python -m spacy download en_core_web_lg   # if you add NER for English
#   python -m nltk.downloader punkt wordnet
#
# Check the Gradio version used (4.18.0):
#   import gradio as gr
#   print(gr.__version__)
#
# References:
#   Gradio Quickstart Guide: https://www.gradio.app/guides/quickstart
#   Hugging Face models: https://huggingface.co/models
#   English grammar API (LanguageTool): https://languagetool.org/http-api/swagger
# -----------------------------------------------

"""
Language learning app with Gradio UI and multiple users:
- Import text from file (.txt/.csv/.pdf) or manual text input
- Grammar correction via transformers (Spanish) or the LanguageTool API (English)
- Analyze text (known/unknown words) per user & language
- Save unknown words as known
- Generate a coherent practice sentence (Spanish & English)
- Log grammar corrections and practice sentence suggestions to CSV
"""

import os
import csv
import datetime
import random
# SQLite is accessed via the built-in sqlite3 module (no need to install sqlite3-binary)
import sqlite3

import requests
import pandas as pd
import pdfplumber
import spacy
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from transformers import AutoTokenizer, BartForConditionalGeneration, AutoModelForCausalLM
import gradio as gr
import gradio_client.utils as _gcu

# --- PATCH for Gradio utils schema bug ---
# Some Gradio versions crash when a component schema is not a dict;
# fall back to "any" instead of raising.
_orig_json = _gcu.json_schema_to_python_type
_orig_get = _gcu.get_type


def _patched_json_to_py(schema, defs=None):
    if not isinstance(schema, dict):
        return "any"
    try:
        return _orig_json(schema, defs)
    except Exception:
        return "any"


def _patched_get_type(schema):
    if not isinstance(schema, dict):
        return "any"
    try:
        return _orig_get(schema)
    except Exception:
        return "any"


_gcu.json_schema_to_python_type = _patched_json_to_py
_gcu.get_type = _patched_get_type

# --- SQLite database initialization ---
DB_NAME = "vocabulary.db"
conn = sqlite3.connect(DB_NAME)
conn.execute("""
CREATE TABLE IF NOT EXISTS vocabulary (
    user_id TEXT,
    language TEXT,
    word TEXT,
    timestamp TEXT,
    UNIQUE(user_id, language, word)
)
""")
conn.commit()
conn.close()


# --- Save word to database ---
def save_word_to_db(user_id: str, language: str, word: str):
    ts = datetime.datetime.now().isoformat()
    conn = sqlite3.connect(DB_NAME)
    conn.execute(
        "INSERT OR IGNORE INTO vocabulary (user_id, language, word, timestamp) VALUES (?, ?, ?, ?)",
        (user_id, language, word, ts)
    )
    conn.commit()
    conn.close()


# --- Retrieve known words for user/language ---
def get_user_vocabulary(user_id: str, language: str) -> set[str]:
    conn = sqlite3.connect(DB_NAME)
    rows = conn.execute(
        "SELECT word FROM vocabulary WHERE user_id=? AND language=?",
        (user_id, language)
    ).fetchall()
    conn.close()
    return {r[0] for r in rows}
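# Usage sketch for the vocabulary helpers (hypothetical user/word values;
# INSERT OR IGNORE plus the UNIQUE constraint makes repeated saves no-ops):
#   save_word_to_db("alice", "es", "perro")
#   save_word_to_db("alice", "es", "perro")   # duplicate, silently ignored
#   get_user_vocabulary("alice", "es")        # -> {"perro"}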
# --- Load NLP models ---
nlp = spacy.load("es_core_news_lg")
tokenizer = AutoTokenizer.from_pretrained("SkitCon/gec-spanish-BARTO-COWS-L2H")
model = BartForConditionalGeneration.from_pretrained("SkitCon/gec-spanish-BARTO-COWS-L2H")
gpt2_tokenizer_es = AutoTokenizer.from_pretrained("mrm8488/spanish-gpt2")
gpt2_model_es = AutoModelForCausalLM.from_pretrained("mrm8488/spanish-gpt2")
gpt2_tokenizer_en = AutoTokenizer.from_pretrained("gpt2")
gpt2_model_en = AutoModelForCausalLM.from_pretrained("gpt2")
lemmatizer = WordNetLemmatizer()


# --- Log to CSV (grammar corrections and sentence suggestions) ---
def log_to_csv(filename, row, fieldnames):
    file_exists = os.path.isfile(filename)
    with open(filename, "a", newline='', encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        if not file_exists:
            writer.writeheader()
        writer.writerow(row)


# --- File import ---
def import_file(path: str) -> str:
    ext = os.path.splitext(path)[1].lower()
    if ext == ".pdf":
        pages = []
        with pdfplumber.open(path) as pdf:
            for p in pdf.pages:
                pages.append(p.extract_text() or "")
        return "\n".join(pages)
    if ext == ".csv":
        df = pd.read_csv(path)
        if "text" in df.columns:
            return "\n".join(df["text"].astype(str))
        raise ValueError("CSV is missing a 'text' column.")
    if ext == ".txt":
        with open(path, encoding="utf-8") as f:
            return f.read()
    raise ValueError(f"Unknown file format: {ext}")


# --- Grammar correction ---
def correct_grammar(text: str, language: str) -> str:
    if language == "es":
        # Spanish: correct sentence by sentence with the BART GEC model.
        corrected = []
        for sent in nlp(text).sents:
            s = sent.text.strip()
            if not s:
                continue
            inp = tokenizer(s, return_tensors="pt", truncation=True, padding=True)
            out = model.generate(
                **inp,
                max_new_tokens=inp.input_ids.shape[1],
                num_beams=5,
                early_stopping=True
            )
            corrected.append(tokenizer.decode(out[0], skip_special_tokens=True))
        return " ".join(corrected)
    # English: LanguageTool API. Apply replacements from the end of the text
    # backwards so earlier offsets stay valid after each substitution.
    resp = requests.post(
        "https://api.languagetool.org/v2/check",
        data={"text": text, "language": language},
        timeout=30
    ).json()
    for m in reversed(resp.get("matches", [])):
        off, ln = m["offset"], m["length"]
        repls = m.get("replacements", [])
        val = repls[0]["value"] if repls else ""
        text = text[:off] + val + text[off + ln:]
    return text


# --- Analyze known and unknown words ---
def analyze_text(text: str, user_id: str, language: str):
    toks = word_tokenize(text)
    # Note: WordNetLemmatizer is English-oriented; for Spanish it mostly
    # passes words through unchanged.
    lems = [lemmatizer.lemmatize(w.lower()) for w in toks if w.isalpha()]
    vocab = get_user_vocabulary(user_id, language)
    known = [w for w in lems if w in vocab]
    unknown = [w for w in lems if w not in vocab]
    return known, unknown


# --- Generate a sentence with GPT-2 based on unknown words ---
def generate_coherent_sentence(text: str, user_id: str, language: str, num_unknown=2) -> str:
    kn, un = analyze_text(text, user_id, language)
    if not un:
        return "No unknown words to generate a sentence with."
    chosen = random.sample(un, min(int(num_unknown), len(un)))
    # Use local names so the global Spanish GEC tokenizer/model are not shadowed.
    if language == "es":
        prompt = "Escribe una sola frase clara que incluya estas palabras: " + ", ".join(chosen) + "."
        gen_tokenizer = gpt2_tokenizer_es
        gen_model = gpt2_model_es
    else:
        prompt = "Write one clear sentence that includes the following words: " + ", ".join(chosen) + "."
        gen_tokenizer = gpt2_tokenizer_en
        gen_model = gpt2_model_en
    inp = gen_tokenizer(prompt, return_tensors="pt", truncation=True)
    outs = gen_model.generate(
        **inp,
        max_new_tokens=50,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        pad_token_id=gen_tokenizer.eos_token_id  # GPT-2 has no pad token
    )
    gen = gen_tokenizer.decode(outs[0], skip_special_tokens=True)
    body = gen[len(prompt):].strip() if gen.startswith(prompt) else gen.strip()
    sentence = (body.split(".")[0].strip() + ".") if "." in body else body
    if not any(c.isalpha() for c in sentence):
        return "Failed to generate a meaningful practice sentence."
    return sentence
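# Usage sketch (hypothetical inputs; results vary: LanguageTool suggestions
# come from the live API, and generation samples with do_sample=True):
#   correct_grammar("She go to school.", "en")
#   # -> e.g. "She goes to school."
#   generate_coherent_sentence("El perro come pan.", "alice", "es")
#   # -> one short Spanish sentence built around up to 2 sampled unknown words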
# --- Gradio process callback ---
def process(user, language, txt, file, do_grammar, do_save):
    try:
        if txt and txt.strip():
            text = txt.strip()
        elif file:
            text = import_file(file.name)
        else:
            return "", "", "", "No text provided.", ""
        out = correct_grammar(text, language) if do_grammar else text
        kn, un = analyze_text(out, user, language)
        status = ""
        if do_save and un:
            for w in un:
                save_word_to_db(user, language, w)
            status = f"Saved {len(un)} words."
        # Log the grammar correction to CSV
        log_to_csv(
            "grammarlog.csv",
            {
                "user": user,
                "language": language,
                "input": text,
                "output": out,
                "timestamp": datetime.datetime.now().isoformat()
            },
            ["user", "language", "input", "output", "timestamp"]
        )
        return out, ", ".join(kn), ", ".join(un), status, ""
    except Exception:
        import traceback
        tb = traceback.format_exc()
        return "", "", "", f"ERROR in process:\n{tb}", ""


# --- Sentence generation callback ---
def coherent_fn(user, language, txt, num):
    try:
        # Slider values may arrive as floats; cast before sampling.
        suggestion = generate_coherent_sentence(txt or "", user, language, int(num))
        # Log the practice suggestion to CSV
        log_to_csv(
            "sentencelog.csv",
            {
                "user": user,
                "language": language,
                "input": txt,
                "output": suggestion,
                "timestamp": datetime.datetime.now().isoformat()
            },
            ["user", "language", "input", "output", "timestamp"]
        )
        return suggestion
    except Exception as e:
        return f"Error during generation: {e}"


# --- Gradio UI ---
demo = gr.Blocks()
with demo:
    gr.Markdown("### 🌟 Language learning app with users & multilingual support")
    with gr.Row():
        user_input = gr.Textbox(label="Username", placeholder="Your name here")
        lang_dd = gr.Dropdown(choices=["es", "en"], value="es", label="Language")
    with gr.Column():
        manual_input = gr.Textbox(lines=4, label="Type/paste text")
        file_input = gr.File(file_types=[".txt", ".csv", ".pdf"], label="Import file")
        grammar_cb = gr.Checkbox(label="Grammar correction")
        autosave_cb = gr.Checkbox(label="Save unknown words")
        run_btn = gr.Button("Run analysis & correction")
        num_slider = gr.Slider(minimum=1, maximum=5, step=1, value=2,
                               label="Number of unknown words for practice")
        coherent_btn = gr.Button("Coherent practice sentence")
    corr_out = gr.Textbox(label="Corrected text", lines=4)
    known_out = gr.Textbox(label="Known words")
    unknown_out = gr.Textbox(label="Unknown words")
    status_out = gr.Textbox(label="Status")
    coherent_out = gr.Textbox(label="Coherent practice sentence")

    # --- Button click bindings ---
    run_btn.click(
        fn=process,
        inputs=[user_input, lang_dd, manual_input, file_input, grammar_cb, autosave_cb],
        outputs=[corr_out, known_out, unknown_out, status_out, coherent_out]
    )
    coherent_btn.click(
        fn=coherent_fn,
        inputs=[user_input, lang_dd, manual_input, num_slider],
        outputs=[coherent_out]
    )

# Make sure the selected language matches the language of the text to analyze.

# --- Start app ---
if __name__ == "__main__":
    # launch() returns (app, local_url, share_url), not a single URL string
    _, local_url, share_url = demo.launch(share=True, inbrowser=True, prevent_thread_lock=True)
    print("App running at:", share_url or local_url)
    demo.block_thread()  # keep the server alive, since prevent_thread_lock=True returns immediately
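# Quick inspection of the CSV logs (a sketch; assumes at least one run has
# been logged so the files exist):
#   import pandas as pd
#   print(pd.read_csv("grammarlog.csv").tail())
#   print(pd.read_csv("sentencelog.csv").tail())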