File size: 11,938 Bytes
0b721e9
 
d23325f
0b721e9
1a0652e
0b721e9
 
1a0652e
0b721e9
 
 
 
 
1a0652e
0b721e9
 
1a0652e
 
0b721e9
 
 
 
 
1a0652e
0b721e9
 
1a0652e
0b721e9
 
 
 
 
1a0652e
0b721e9
 
 
1a0652e
0b721e9
 
 
1a0652e
0b721e9
 
 
 
1a0652e
0b721e9
 
1a0652e
 
d23325f
0b721e9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1a0652e
0b721e9
 
 
 
 
 
 
 
 
 
 
1a0652e
0b721e9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1a0652e
0b721e9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1a0652e
0b721e9
 
 
 
 
 
1a0652e
0b721e9
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon May 19 16:49:22 2025

@author: jacobwildt-persson
"""

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# -----------------------------------------------
# Requirements & Setup Instructions
# -----------------------------------------------

# Python version:
# Requires Python 3.10 or later (tested on 3.12)


# Run your script inside a virtual environment (e.g. conda or venv) to avoid conflicts.
# Recreate the environment with these commands in the terminal
# conda env create -f environment.yml
# conda activate sprakenv
#

# Install all required packages:
# Run these commands in the terminal:

# pip install --upgrade gradio
# pip install pdfplumber
# pip install nltk
# pip install transformers
# pip install -U spacy

# Download language models:
# python -m spacy download es_core_news_lg
# python -m spacy download en_core_web_lg  # if you add NER for English

# Check Gradio version used:
# import gradio as gr
# print(gr.__version__)  # Gradio version 4.18.0

# 🔗 Reference: Gradio Quickstart Guide
# https://www.gradio.app/guides/quickstart
#Hugging Face
 # https://huggingface.co/models

# English API model
# LanguageTool API: https://languagetool.org/http-api/swagger



# Remember !!!!!!!!!!!!!!!!!!!!!!!!!
# Run your script inside a virtual environment (e.g. conda or venv) to avoid conflicts.
# Recreate the environment with these commands in the terminal
# conda env create -f environment.yml
# conda activate sprakenv
# python -m spacy download es_core_news_lg
#python -m nltk.downloader punkt wordnet
# -----------------------------------------------
"""
Language learning app with Gradio UI, on & multiple users:
- Import text from file (.txt/.csv/.pdf) or manual text input
- Grammar correction via transformers (Spanish) or LanguageTool API (English)
- Analyze text (known/unknown words) per user & language
- Save unknown words as known
- Generate coherent practice sentence (Spanish & English)
- Log grammar corrections and practice sentence suggestions to CSV
"""
import os
import datetime
import sqlite3
import requests
import random
import pandas as pd
import pdfplumber
import spacy
import csv
# SQLite is accessed via the built-in sqlite3 module (no need to install sqlite3-binary)
import sqlite3

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from transformers import AutoTokenizer, BartForConditionalGeneration, AutoModelForCausalLM
import gradio as gr
import gradio_client.utils as _gcu

# --- PATCH for Gradio utils schema bug ---
# Gradio's client utils can crash on schemas that are not plain dicts; wrap
# the original converters so any failure degrades to the permissive "any".
_orig_json = _gcu.json_schema_to_python_type
_orig_get = _gcu.get_type


def _patched_json_to_py(schema, defs=None):
    """Fault-tolerant wrapper around json_schema_to_python_type."""
    if isinstance(schema, dict):
        try:
            return _orig_json(schema, defs)
        except Exception:
            pass
    return "any"


def _patched_get_type(schema):
    """Fault-tolerant wrapper around get_type."""
    if isinstance(schema, dict):
        try:
            return _orig_get(schema)
        except Exception:
            pass
    return "any"


_gcu.json_schema_to_python_type = _patched_json_to_py
_gcu.get_type = _patched_get_type

# --- SQLite Database initialization ---
# One row per (user, language, word); the UNIQUE constraint lets later
# inserts use INSERT OR IGNORE, so re-saving a known word is a no-op.
DB_NAME = "vocabulary.db"

_init_conn = sqlite3.connect(DB_NAME)
_init_conn.execute(
    "CREATE TABLE IF NOT EXISTS vocabulary ("
    " user_id   TEXT,"
    " language  TEXT,"
    " word      TEXT,"
    " timestamp TEXT,"
    " UNIQUE(user_id, language, word)"
    ")"
)
_init_conn.commit()
_init_conn.close()

# --- Save word to database ---
def save_word_to_db(user_id: str, language: str, word: str) -> None:
    """Persist *word* as known for (user_id, language).

    Duplicates are ignored via the table's UNIQUE constraint
    (INSERT OR IGNORE). The connection is closed even if the
    INSERT raises, so repeated UI calls cannot leak handles.
    """
    ts = datetime.datetime.now().isoformat()
    conn = sqlite3.connect(DB_NAME)
    try:
        conn.execute(
            "INSERT OR IGNORE INTO vocabulary (user_id, language, word, timestamp) VALUES (?, ?, ?, ?)",
            (user_id, language, word, ts)
        )
        conn.commit()
    finally:
        conn.close()

# --- Retrieve known words for user/language ---
def get_user_vocabulary(user_id: str, language: str) -> set[str]:
    """Return the set of words the user has saved as known for *language*.

    The connection is closed even if the query raises (try/finally),
    fixing a handle leak on error in the original implementation.
    """
    conn = sqlite3.connect(DB_NAME)
    try:
        rows = conn.execute(
            "SELECT word FROM vocabulary WHERE user_id=? AND language=?",
            (user_id, language)
        ).fetchall()
    finally:
        conn.close()
    return {r[0] for r in rows}

# --- Load NLP models ---
# All models are loaded once at import time; startup is slow but each
# request afterwards reuses the in-memory models.
nlp = spacy.load("es_core_news_lg")  # Spanish pipeline; used for sentence splitting in correct_grammar
tokenizer = AutoTokenizer.from_pretrained("SkitCon/gec-spanish-BARTO-COWS-L2H")  # Spanish grammar-correction model
model     = BartForConditionalGeneration.from_pretrained("SkitCon/gec-spanish-BARTO-COWS-L2H")
gpt2_tokenizer_es = AutoTokenizer.from_pretrained("mrm8488/spanish-gpt2")  # Spanish sentence generation
gpt2_model_es     = AutoModelForCausalLM.from_pretrained("mrm8488/spanish-gpt2")
gpt2_tokenizer_en = AutoTokenizer.from_pretrained("gpt2")  # English sentence generation
gpt2_model_en     = AutoModelForCausalLM.from_pretrained("gpt2")
lemmatizer        = WordNetLemmatizer()  # NLTK lemmatizer used by analyze_text

# --- Log to CSV (grammar corrections and sentence suggestions) ---
def log_to_csv(filename, row, fieldnames):
    """Append *row* (a dict) to *filename*, writing a header first if needed.

    Args:
        filename: Path of the CSV log file; created on first use.
        row: Mapping of column name -> value, keys matching *fieldnames*.
        fieldnames: Column order for the header and every row.
    """
    # Write the header when the file is missing OR exists but is empty
    # (e.g. a previous run was interrupted right after creating it);
    # the original only checked for existence.
    needs_header = not os.path.isfile(filename) or os.path.getsize(filename) == 0
    with open(filename, "a", newline='', encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        if needs_header:
            writer.writeheader()
        writer.writerow(row)

# --- File Import ---
def import_file(path: str) -> str:
    """Read the text content of a .txt, .csv (column 'text') or .pdf file.

    Returns:
        The extracted text, pages/rows joined with newlines.
    Raises:
        ValueError: If the CSV lacks a 'text' column or the extension is
            unsupported (messages are in Swedish, matching the UI).
    """
    ext = os.path.splitext(path)[1].lower()
    if ext == ".pdf":
        pages = []
        with pdfplumber.open(path) as pdf:
            for p in pdf.pages:
                # extract_text() can return None for image-only pages
                pages.append(p.extract_text() or "")
        return "\n".join(pages)
    if ext == ".csv":
        df = pd.read_csv(path)
        if "text" in df:
            return "\n".join(df["text"].astype(str))
        raise ValueError("CSV saknar kolumnen 'text'.")
    if ext == ".txt":
        # Context manager so the handle is always closed (the original
        # used a bare open(...).read(), leaking the file object).
        with open(path, encoding="utf-8") as fh:
            return fh.read()
    raise ValueError(f"Okänt filformat: {ext}")
    
# --- Grammar Correction ---

def correct_grammar(text: str, language: str) -> str:
    """Return a grammar-corrected version of *text*.

    For Spanish ("es"), each spaCy-segmented sentence is passed through the
    BARTO GEC seq2seq model and the corrected sentences are joined with
    spaces. For any other language code, the text is sent to the public
    LanguageTool HTTP API and its first suggested replacement per match is
    applied (requires network access; no timeout or HTTP-error handling —
    a failed request raises).
    """
    if language == "es":
        corrected = []
        for sent in nlp(text).sents:
            s = sent.text.strip()
            if not s: continue  # skip empty sentence spans
            inp = tokenizer(s, return_tensors="pt", truncation=True, padding=True)
            out = model.generate(
                **inp,
                # cap generation at the input length in tokens
                max_new_tokens=inp.input_ids.shape[1],
                num_beams=5,
                early_stopping=True
            )
            corrected.append(tokenizer.decode(out[0], skip_special_tokens=True))
        return " ".join(corrected)
    # English: LanguageTool API
    resp = requests.post(
        "https://api.languagetool.org/v2/check",
        data={"text": text, "language": language}
    ).json()
    # Apply replacements from the end of the string backwards so the
    # offsets of earlier matches stay valid while text is being edited.
    for m in reversed(resp.get("matches", [])):
        off, ln = m["offset"], m["length"]
        repls = m.get("replacements", [])
        val = repls[0]["value"] if repls else ""  # no suggestion -> remove the span
        text = text[:off] + val + text[off+ln:]
    return text

# --- Analyze known and unknown words ---

def analyze_text(text: str, user_id: str, language: str):
    """Lemmatize the alphabetic tokens of *text* and partition them against
    the user's stored vocabulary.

    Returns a (known, unknown) pair of lemma lists; duplicates are kept in
    token order.
    """
    lemmas = [
        lemmatizer.lemmatize(token.lower())
        for token in word_tokenize(text)
        if token.isalpha()
    ]
    vocab = get_user_vocabulary(user_id, language)
    known = [lemma for lemma in lemmas if lemma in vocab]
    unknown = [lemma for lemma in lemmas if lemma not in vocab]
    return known, unknown
# --- Generate sentence using GPT2 based on unknown words ---
def generate_coherent_sentence(text: str, user_id: str, language: str, num_unknown=2) -> str:
    """Generate one practice sentence containing up to *num_unknown* of the
    user's unknown words from *text*, using the GPT-2 model for *language*.

    Sampling (do_sample=True) makes the output non-deterministic. Returns a
    Swedish message string when there are no unknown words or when the
    generated text contains no alphabetic characters.
    """
    kn, un = analyze_text(text, user_id, language)
    if not un:
        return "Inga okända ord att generera mening med."
    # Pick a random subset of the unknown words to build the prompt around
    chosen = random.sample(un, min(num_unknown, len(un)))
    if language == "es":
        prompt = "Escribe una sola frase clara que incluya estas palabras: " + ", ".join(chosen) + "."
        # NOTE: these locals shadow the module-level GEC tokenizer/model
        tokenizer = gpt2_tokenizer_es
        model     = gpt2_model_es
    else:
        prompt = "Write one clear sentence that includes the following words: " + ", ".join(chosen) + "."
        tokenizer = gpt2_tokenizer_en
        model     = gpt2_model_en
    inp = tokenizer(prompt, return_tensors="pt", truncation=True)
    outs = model.generate(
        **inp,
        max_new_tokens=50,
        do_sample=True,
        top_k=50,
        top_p=0.95
    )
    gen = tokenizer.decode(outs[0], skip_special_tokens=True)
    # Strip the echoed prompt, then keep only the first sentence of the rest
    body = gen[len(prompt):].strip() if gen.startswith(prompt) else gen.strip()
    sentence = (body.split(".")[0].strip() + ".") if "." in body else body
    if not any(c.isalpha() for c in sentence):
        return "Misslyckades att generera meningsfull övningsmening."
    return sentence

 
# --- Gradio process callback ---
def process(user, language, txt, file, do_grammar, do_save):
    """Main analysis callback for the "run" button.

    Chooses the manual text (preferred) or the uploaded file as input,
    optionally corrects grammar, partitions words into known/unknown,
    optionally saves unknown words to the DB, and logs the run to
    grammarlog.csv. Returns the five values expected by the output widgets:
    (corrected text, known words, unknown words, status, practice sentence).
    """
    try:
        text = (txt or "").strip()
        if not text:
            if not file:
                return "", "", "", "Ingen text angiven.", ""
            text = import_file(file.name)
        result = correct_grammar(text, language) if do_grammar else text
        known, unknown = analyze_text(result, user, language)
        status = ""
        if do_save and unknown:
            for word in unknown:
                save_word_to_db(user, language, word)
            status = f"Sparade {len(unknown)} ord."
        # Log the grammar correction to CSV
        log_to_csv(
            "grammarlog.csv",
            {
                "user": user, "language": language, "input": text,
                "output": result, "timestamp": datetime.datetime.now().isoformat()
            },
            ["user", "language", "input", "output", "timestamp"]
        )
        return result, ", ".join(known), ", ".join(unknown), status, ""
    except Exception:
        # Surface the full traceback in the status box instead of crashing the UI
        import traceback
        return "", "", "", f"FEL i process:\n{traceback.format_exc()}", ""

# --- Sentence generation callback ---
def coherent_fn(user, language, txt, num):
    """Callback for the practice-sentence button: generate a sentence from
    the user's unknown words and log the suggestion to sentencelog.csv.
    Returns an error message string instead of raising on failure."""
    try:
        sentence = generate_coherent_sentence(txt or "", user, language, num)
        # Log the practice-sentence suggestion to CSV
        log_to_csv(
            "sentencelog.csv",
            {
                "user": user, "language": language, "input": txt,
                "output": sentence, "timestamp": datetime.datetime.now().isoformat()
            },
            ["user", "language", "input", "output", "timestamp"]
        )
        return sentence
    except Exception as err:
        return f"Fel vid generering: {err}"

# --- Gradio UI ---
# Layout: user/language row, then an input column, then the five output
# boxes that the callbacks fill in order.
demo = gr.Blocks()
with demo:
    gr.Markdown("### 🌟 Språkinlärningsapp med användare & flerspråkighet")
    with gr.Row():
        # Vocabulary is stored per (user, language) pair
        user_input  = gr.Textbox(label="Användarnamn", placeholder="Ditt namn här")
        lang_dd     = gr.Dropdown(choices=["es", "en"], value="es", label="Språk")
    with gr.Column():
        manual_input = gr.Textbox(lines=4, label="Skriv/klistra in text")
        file_input   = gr.File(file_types=[".txt",".csv",".pdf"], label="Importera fil")
        grammar_cb   = gr.Checkbox(label="Grammatik­rättning")
        autosave_cb  = gr.Checkbox(label="Spara okända ord")
        run_btn      = gr.Button("Kör analys & korrigering")
        num_slider   = gr.Slider(minimum=1, maximum=5, step=1, value=2, label="Antal okända ord för övning")
        coherent_btn = gr.Button("Koherent övningsmening")

    # Output widgets; order matches the tuples returned by the callbacks
    corr_out    = gr.Textbox(label="Korrigerad text", lines=4)
    known_out   = gr.Textbox(label="Kända ord")
    unknown_out = gr.Textbox(label="Okända ord")
    status_out  = gr.Textbox(label="Status")
    coherent_out = gr.Textbox(label="Koherent övningsmening")

    # --- Button click bindings ---
    run_btn.click(
        fn=process,
        inputs=[user_input, lang_dd, manual_input, file_input, grammar_cb, autosave_cb],
        outputs=[corr_out, known_out, unknown_out, status_out, coherent_out]
    )
    coherent_btn.click(
        fn=coherent_fn,
        inputs=[user_input, lang_dd, manual_input, num_slider],
        outputs=[coherent_out]
    )
    # Make sure the selected language matches the language of the text being analyzed

# --- Start app ---
if __name__ == "__main__":
    # launch() returns (app, local_url, share_url), not a plain URL string.
    # With prevent_thread_lock=True the call returns immediately, so the
    # original script exited right after printing and killed the server;
    # block_thread() keeps it alive until interrupted.
    _, local_url, share_url = demo.launch(share=True, inbrowser=True, prevent_thread_lock=True)
    print("Appen körs på:", share_url or local_url)
    demo.block_thread()