JacobWP's picture
Upload 2 files
0b721e9 verified
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon May 19 16:49:22 2025
@author: jacobwildt-persson
"""
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# -----------------------------------------------
# Requirements & Setup Instructions
# -----------------------------------------------
# Python version:
# Requires Python 3.10 or later (tested on 3.12)
# Run your script inside a virtual environment (e.g. conda or venv) to avoid conflicts.
# Recreate the environment with these commands in the terminal:
# conda env create -f environment.yml
# conda activate sprakenv
#
# Install all required packages:
# Run these commands in the terminal:
# pip install --upgrade gradio
# pip install pdfplumber
# pip install nltk
# pip install transformers
# pip install -U spacy
# Download language models:
# python -m spacy download es_core_news_lg
# python -m spacy download en_core_web_lg # if you add NER for English
# Check Gradio version used:
# import gradio as gr
# print(gr.__version__) # Gradio version 4.18.0
# 🔗 Reference: Gradio Quickstart Guide
# https://www.gradio.app/guides/quickstart
#Hugging Face
# https://huggingface.co/models
# English API model
# LanguageTool API: https://languagetool.org/http-api/swagger
# Remember!
# Run your script inside a virtual environment (e.g. conda or venv) to avoid conflicts.
# Recreate the environment with these commands in the terminal:
# conda env create -f environment.yml
# conda activate sprakenv
# python -m spacy download es_core_news_lg
#python -m nltk.downloader punkt wordnet
# -----------------------------------------------
"""
Language learning app with a Gradio UI, for one or multiple users:
- Import text from file (.txt/.csv/.pdf) or manual text input
- Grammar correction via transformers (Spanish) or LanguageTool API (English)
- Analyze text (known/unknown words) per user & language
- Save unknown words as known
- Generate coherent practice sentence (Spanish & English)
- Log grammar corrections and practice sentence suggestions to CSV
"""
import os
import datetime
import sqlite3
import requests
import random
import pandas as pd
import pdfplumber
import spacy
import csv
# SQLite is accessed via the built-in sqlite3 module (no need to install sqlite3-binary)
import sqlite3
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from transformers import AutoTokenizer, BartForConditionalGeneration, AutoModelForCausalLM
import gradio as gr
import gradio_client.utils as _gcu
# --- PATCH for Gradio utils schema bug ---
# Keep handles to the unpatched gradio_client helpers so the wrapper
# functions defined below can delegate to them.
_orig_json = _gcu.json_schema_to_python_type
_orig_get = _gcu.get_type
def _patched_json_to_py(schema, defs=None):
if not isinstance(schema, dict):
return "any"
try:
return _orig_json(schema, defs)
except Exception:
return "any"
def _patched_get_type(schema):
    """Return the schema's type via the original helper, or "any" on failure.

    Non-dict schemas are never passed to the original helper; any exception
    it raises for a dict schema is converted into the "any" fallback.
    """
    fallback = "any"
    if not isinstance(schema, dict):
        return fallback
    try:
        resolved = _orig_get(schema)
    except Exception:
        resolved = fallback
    return resolved
# Install the defensive wrappers in place of the gradio_client originals.
_gcu.json_schema_to_python_type = _patched_json_to_py
_gcu.get_type = _patched_get_type
# --- SQLite Database initialization ---
DB_NAME = "vocabulary.db"

# One row per (user, language, word); the UNIQUE constraint is what lets
# "INSERT OR IGNORE" skip words that are already stored.
_CREATE_VOCABULARY_TABLE = """
CREATE TABLE IF NOT EXISTS vocabulary (
user_id TEXT,
language TEXT,
word TEXT,
timestamp TEXT,
UNIQUE(user_id, language, word)
)
"""

conn = sqlite3.connect(DB_NAME)
try:
    conn.execute(_CREATE_VOCABULARY_TABLE)
    conn.commit()
finally:
    # Close the handle even if table creation fails so the database file
    # is never left open.
    conn.close()
# --- Save word to database ---
def save_word_to_db(user_id: str, language: str, word: str) -> None:
    """Store ``word`` for ``user_id`` and ``language`` with the current time.

    Uses ``INSERT OR IGNORE``, so a word that is already stored for the same
    user and language is left unchanged.

    Args:
        user_id: Identifier of the user the word belongs to.
        language: Language code the word is stored under.
        word: The word to store.
    """
    ts = datetime.datetime.now().isoformat()
    conn = sqlite3.connect(DB_NAME)
    try:
        conn.execute(
            "INSERT OR IGNORE INTO vocabulary (user_id, language, word, timestamp) VALUES (?, ?, ?, ?)",
            (user_id, language, word, ts),
        )
        conn.commit()
    finally:
        # Always release the connection, even when the insert fails.
        conn.close()
# --- Retrieve known words for user/language ---
def get_user_vocabulary(user_id: str, language: str) -> set[str]:
    """Return the set of words stored for ``user_id`` in ``language``.

    Args:
        user_id: Identifier of the user whose words are read.
        language: Language code to filter on.

    Returns:
        A set with every stored word for that user and language; empty when
        nothing is stored.
    """
    conn = sqlite3.connect(DB_NAME)
    try:
        rows = conn.execute(
            "SELECT word FROM vocabulary WHERE user_id=? AND language=?",
            (user_id, language),
        ).fetchall()
    finally:
        # Always release the connection, even when the query fails.
        conn.close()
    return {r[0] for r in rows}
# --- Load NLP models ---
# Checkpoint identifiers, named once so each tokenizer/model pair is loaded
# from the same source.
_GEC_CHECKPOINT = "SkitCon/gec-spanish-BARTO-COWS-L2H"
_GPT2_ES_CHECKPOINT = "mrm8488/spanish-gpt2"
_GPT2_EN_CHECKPOINT = "gpt2"

# spaCy pipeline; correct_grammar uses it to split text into sentences.
nlp = spacy.load("es_core_news_lg")

# Seq2seq model used by correct_grammar for Spanish text.
tokenizer = AutoTokenizer.from_pretrained(_GEC_CHECKPOINT)
model = BartForConditionalGeneration.from_pretrained(_GEC_CHECKPOINT)

# Causal language models used to generate practice sentences.
gpt2_tokenizer_es = AutoTokenizer.from_pretrained(_GPT2_ES_CHECKPOINT)
gpt2_model_es = AutoModelForCausalLM.from_pretrained(_GPT2_ES_CHECKPOINT)
gpt2_tokenizer_en = AutoTokenizer.from_pretrained(_GPT2_EN_CHECKPOINT)
gpt2_model_en = AutoModelForCausalLM.from_pretrained(_GPT2_EN_CHECKPOINT)

# WordNet lemmatizer used by analyze_text.
lemmatizer = WordNetLemmatizer()
# --- Log to CSV (grammar corrections and sentence suggestions) ---
def log_to_csv(filename, row, fieldnames):
    """Append ``row`` to the CSV file ``filename``.

    The header row is written first when the file does not exist yet or
    exists but is empty, so the log always starts with its column names.

    Args:
        filename: Path of the CSV file to append to.
        row: Mapping from column name to value for the new record.
        fieldnames: Column names, in the order they are written.
    """
    # An existing zero-byte file needs a header just like a missing one.
    needs_header = not os.path.isfile(filename) or os.path.getsize(filename) == 0
    with open(filename, "a", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        if needs_header:
            writer.writeheader()
        writer.writerow(row)
# --- File Import ---
def import_file(path: str) -> str:
    """Read the text content of a .pdf, .csv or .txt file.

    The format is chosen from the file extension, compared case-insensitively.

    Args:
        path: Path of the file to read.

    Returns:
        For PDF, the extracted text of every page joined by newlines (pages
        without extractable text contribute an empty string). For CSV, the
        values of the ``text`` column joined by newlines. For TXT, the file
        content decoded as UTF-8.

    Raises:
        ValueError: If a CSV file has no ``text`` column or the extension is
            not one of the supported formats.
    """
    ext = os.path.splitext(path)[1].lower()
    if ext == ".pdf":
        with pdfplumber.open(path) as pdf:
            pages = [page.extract_text() or "" for page in pdf.pages]
        return "\n".join(pages)
    if ext == ".csv":
        df = pd.read_csv(path)
        if "text" in df:
            return "\n".join(df["text"].astype(str))
        raise ValueError("CSV saknar kolumnen 'text'.")
    if ext == ".txt":
        # Context manager so the file handle is closed deterministically.
        with open(path, encoding="utf-8") as handle:
            return handle.read()
    raise ValueError(f"Okänt filformat: {ext}")
# --- Grammar Correction ---
def _apply_languagetool_matches(text: str, matches: list) -> str:
    """Apply the first suggested replacement of each LanguageTool match."""
    # Work from the end of the text towards the start so that the offsets of
    # the matches still to be applied remain valid.
    boundary = len(text)
    for match in sorted(matches, key=lambda m: m["offset"], reverse=True):
        replacements = match.get("replacements") or []
        if not replacements:
            # No suggestion: keep the flagged text instead of deleting it.
            continue
        start = match["offset"]
        end = start + match["length"]
        if end > boundary:
            # Overlaps a correction that was already applied; skip it rather
            # than splice into rewritten text.
            continue
        text = text[:start] + replacements[0]["value"] + text[end:]
        boundary = start
    return text


def correct_grammar(text: str, language: str) -> str:
    """Return ``text`` with grammar corrections applied.

    For ``language == "es"`` the text is split into sentences with the spaCy
    pipeline and each non-empty sentence is rewritten by the seq2seq model;
    the corrected sentences are joined with single spaces. For any other
    language code the text is sent to the public LanguageTool API and the
    first suggested replacement of every match is applied.

    Args:
        text: The text to correct.
        language: Language code; ``"es"`` selects the local model, anything
            else is passed to LanguageTool as-is.

    Returns:
        The corrected text.

    Raises:
        requests.RequestException: If the LanguageTool request fails, times
            out, or returns an HTTP error status.
    """
    if language == "es":
        corrected = []
        for sent in nlp(text).sents:
            s = sent.text.strip()
            if not s:
                continue
            inp = tokenizer(s, return_tensors="pt", truncation=True, padding=True)
            out = model.generate(
                **inp,
                # Allow as many new tokens as the input sentence has.
                max_new_tokens=inp.input_ids.shape[1],
                num_beams=5,
                early_stopping=True,
            )
            corrected.append(tokenizer.decode(out[0], skip_special_tokens=True))
        return " ".join(corrected)
    # English: LanguageTool API
    response = requests.post(
        "https://api.languagetool.org/v2/check",
        data={"text": text, "language": language},
        timeout=30,  # never hang the UI callback on an unresponsive service
    )
    response.raise_for_status()
    return _apply_languagetool_matches(text, response.json().get("matches", []))
# --- Analyze known and unknown words ---
def analyze_text(text: str, user_id: str, language: str):
    """Split the lemmas of ``text`` into known and unknown words.

    Purely alphabetic tokens are lower-cased and lemmatized; each lemma is
    then classified by membership in the vocabulary stored for ``user_id``
    and ``language``. Repeated words appear once per occurrence.

    Returns:
        A ``(known, unknown)`` tuple of lemma lists in text order.
    """
    # NOTE(review): tokenizer and lemmatizer are called the same way for
    # every language -- confirm this is intended for Spanish input.
    lemmas = []
    for token in word_tokenize(text):
        if token.isalpha():
            lemmas.append(lemmatizer.lemmatize(token.lower()))
    vocabulary = get_user_vocabulary(user_id, language)
    known, unknown = [], []
    for lemma in lemmas:
        bucket = known if lemma in vocabulary else unknown
        bucket.append(lemma)
    return known, unknown
# --- Generate sentence using GPT2 based on unknown words ---
def generate_coherent_sentence(text: str, user_id: str, language: str, num_unknown=2) -> str:
    """Generate one practice sentence built around unknown words of ``text``.

    Args:
        text: Source text whose unknown words are used.
        user_id: User whose stored vocabulary decides what is unknown.
        language: ``"es"`` selects the Spanish prompt and model; any other
            value selects the English ones.
        num_unknown: How many distinct unknown words to ask for. Coerced to
            an integer and clamped to at least 1 and at most the number of
            distinct unknown words.

    Returns:
        The generated sentence, or a (Swedish) message when there are no
        unknown words or the model output contains no letters.
    """
    _, unknown = analyze_text(text, user_id, language)
    # analyze_text reports one entry per occurrence; de-duplicate (keeping
    # first-seen order) so the same word cannot be sampled twice.
    candidates = list(dict.fromkeys(unknown))
    if not candidates:
        return "Inga okända ord att generera mening med."
    # The value comes from a UI slider and may be a float; random.sample
    # needs a positive integer.
    wanted = max(1, int(num_unknown))
    chosen = random.sample(candidates, min(wanted, len(candidates)))
    # Local names deliberately differ from the module-level grammar
    # tokenizer/model to avoid shadowing them.
    if language == "es":
        prompt = "Escribe una sola frase clara que incluya estas palabras: " + ", ".join(chosen) + "."
        gen_tokenizer = gpt2_tokenizer_es
        gen_model = gpt2_model_es
    else:
        prompt = "Write one clear sentence that includes the following words: " + ", ".join(chosen) + "."
        gen_tokenizer = gpt2_tokenizer_en
        gen_model = gpt2_model_en
    inp = gen_tokenizer(prompt, return_tensors="pt", truncation=True)
    outs = gen_model.generate(
        **inp,
        max_new_tokens=50,
        do_sample=True,
        top_k=50,
        top_p=0.95,
    )
    gen = gen_tokenizer.decode(outs[0], skip_special_tokens=True)
    # Drop the echoed prompt when the decoded text starts with it.
    body = gen[len(prompt):].strip() if gen.startswith(prompt) else gen.strip()
    # Keep only the text up to the first full stop.
    sentence = (body.split(".")[0].strip() + ".") if "." in body else body
    if not any(c.isalpha() for c in sentence):
        return "Misslyckades att generera meningsfull övningsmening."
    return sentence
# --- Gradio process callback ---
def process(user, language, txt, file, do_grammar, do_save):
    """Run grammar correction and vocabulary analysis for the UI.

    Manually entered text takes precedence over an uploaded file.

    Args:
        user: User name the vocabulary is stored under.
        language: Language code passed on to correction and analysis.
        txt: Text typed into the UI; may be empty or ``None``.
        file: Uploaded file (an object with a ``name`` attribute or a plain
            path string); used only when ``txt`` is blank.
        do_grammar: Whether to run grammar correction first.
        do_save: Whether to store the unknown words as known.

    Returns:
        A 5-tuple ``(text, known, unknown, status, "")``: the (optionally
        corrected) text, the known and unknown words joined by ``", "``, a
        status message, and an empty string for the fifth output component.
        On any error the first three are empty and the status holds the
        traceback.
    """
    try:
        if txt and txt.strip():
            text = txt.strip()
        elif file:
            # Accept both file-like objects exposing ``.name`` and plain
            # path strings.
            text = import_file(getattr(file, "name", file))
        else:
            return "", "", "", "Ingen text angiven.", ""
        out = correct_grammar(text, language) if do_grammar else text
        kn, un = analyze_text(out, user, language)
        status = ""
        if do_save and un:
            # Store each distinct word once so the reported count matches
            # the number of words actually handed to the database.
            unique_unknown = list(dict.fromkeys(un))
            for w in unique_unknown:
                save_word_to_db(user, language, w)
            status = f"Sparade {len(unique_unknown)} ord."
        # Log the input/output pair to CSV.
        log_to_csv(
            "grammarlog.csv",
            {
                "user": user, "language": language, "input": text,
                "output": out, "timestamp": datetime.datetime.now().isoformat()
            },
            ["user", "language", "input", "output", "timestamp"]
        )
        return out, ", ".join(kn), ", ".join(un), status, ""
    except Exception:
        # UI boundary: show the failure in the status box instead of
        # letting the callback crash.
        import traceback
        tb = traceback.format_exc()
        return "", "", "", f"FEL i process:\n{tb}", ""
# --- Sentence generation callback ---
def coherent_fn(user, language, txt, num):
    """Generate a practice sentence for the UI and log it to CSV.

    Returns the generated sentence, or a message starting with
    "Fel vid generering:" when generation or logging raises.
    """
    try:
        source_text = txt if txt else ""
        suggestion = generate_coherent_sentence(source_text, user, language, num)
        # Log the practice suggestion to CSV.
        columns = ["user", "language", "input", "output", "timestamp"]
        entry = {
            "user": user,
            "language": language,
            "input": txt,
            "output": suggestion,
            "timestamp": datetime.datetime.now().isoformat(),
        }
        log_to_csv("sentencelog.csv", entry, columns)
    except Exception as e:
        return f"Fel vid generering: {e}"
    return suggestion
# --- Gradio UI ---
# NOTE(review): the extent of the Row/Column containers below is assumed --
# confirm the intended layout.
demo = gr.Blocks()
with demo:
    gr.Markdown("### 🌟 Språkinlärningsapp med användare & flerspråkighet")
    # User name and language selection (language codes "es" / "en").
    with gr.Row():
        user_input = gr.Textbox(label="Användarnamn", placeholder="Ditt namn här")
        lang_dd = gr.Dropdown(choices=["es", "en"], value="es", label="Språk")
    # Text input (typed or uploaded), options and action buttons.
    with gr.Column():
        manual_input = gr.Textbox(lines=4, label="Skriv/klistra in text")
        file_input = gr.File(file_types=[".txt",".csv",".pdf"], label="Importera fil")
        grammar_cb = gr.Checkbox(label="Grammatik­rättning")
        autosave_cb = gr.Checkbox(label="Spara okända ord")
        run_btn = gr.Button("Kör analys & korrigering")
        num_slider = gr.Slider(minimum=1, maximum=5, step=1, value=2, label="Antal okända ord för övning")
        coherent_btn = gr.Button("Koherent övningsmening")
    # Output fields filled by the two callbacks.
    corr_out = gr.Textbox(label="Korrigerad text", lines=4)
    known_out = gr.Textbox(label="Kända ord")
    unknown_out = gr.Textbox(label="Okända ord")
    status_out = gr.Textbox(label="Status")
    coherent_out = gr.Textbox(label="Koherent övningsmening")
    # --- Button click wiring ---
    # process returns five values; the fifth goes to coherent_out.
    run_btn.click(
        fn=process,
        inputs=[user_input, lang_dd, manual_input, file_input, grammar_cb, autosave_cb],
        outputs=[corr_out, known_out, unknown_out, status_out, coherent_out]
    )
    coherent_btn.click(
        fn=coherent_fn,
        inputs=[user_input, lang_dd, manual_input, num_slider],
        outputs=[coherent_out]
    )
# Make sure the selected language matches the language of the text to be analyzed.
# --- Start app ---
if __name__ == "__main__":
    import sys

    # launch() returns (app, local_url, share_url); unpack it so a URL is
    # printed rather than the whole tuple.
    _app, local_url, share_url = demo.launch(share=True, inbrowser=True, prevent_thread_lock=True)
    # share_url is None when no public link could be created.
    url = share_url or local_url
    print("Appen körs på:", url)
    # With prevent_thread_lock=True the server stops as soon as the script
    # finishes, so keep a plain script run alive. Interactive sessions
    # (which define sys.ps1) are left unblocked.
    if not bool(getattr(sys, "ps1", sys.flags.interactive)):
        demo.block_thread()