File size: 11,938 Bytes
0b721e9
 
d23325f
0b721e9
1a0652e
0b721e9
 
1a0652e
0b721e9
 
 
 
 
1a0652e
0b721e9
 
1a0652e
 
0b721e9
 
 
 
 
1a0652e
0b721e9
 
1a0652e
0b721e9
 
 
 
 
1a0652e
0b721e9
 
 
1a0652e
0b721e9
 
 
1a0652e
0b721e9
 
 
 
1a0652e
0b721e9
 
1a0652e
 
d23325f
0b721e9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1a0652e
0b721e9
 
 
 
 
 
 
 
 
 
 
1a0652e
0b721e9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1a0652e
0b721e9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1a0652e
0b721e9
 
 
 
 
 
1a0652e
0b721e9
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon May 19 16:49:22 2025

@author: jacobwildt-persson
"""

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# -----------------------------------------------
# Requirements & Setup Instructions
# -----------------------------------------------

# Python version:
# Requires Python 3.10 or later (tested on 3.12)


# Run your script inside a virtual environment (e.g. conda or venv) to avoid conflicts.
# Recreate the environment with these commands in the terminal
# conda env create -f environment.yml
# conda activate sprakenv
#

# Install all required packages:
# Run these commands in the terminal:

# pip install --upgrade gradio
# pip install pdfplumber
# pip install nltk
# pip install transformers
# pip install -U spacy

# Download language models:
# python -m spacy download es_core_news_lg
# python -m spacy download en_core_web_lg  # if you add NER for English

# Check Gradio version used:
# import gradio as gr
# print(gr.__version__)  # Gradio version 4.18.0

# 🔗 Reference: Gradio Quickstart Guide
# https://www.gradio.app/guides/quickstart
#Hugging Face
 # https://huggingface.co/models

# English API model
# LanguageTool API: https://languagetool.org/http-api/swagger



# Remember !!!!!!!!!!!!!!!!!!!!!!!!!
# Run your script inside a virtual environment (e.g. conda or venv) to avoid conflicts.
# Recreate the environment with these commands in the terminal
# conda env create -f environment.yml
# conda activate sprakenv
# python -m spacy download es_core_news_lg
#python -m nltk.downloader punkt wordnet
# -----------------------------------------------
"""
Language learning app with Gradio UI, on & multiple users:
- Import text from file (.txt/.csv/.pdf) or manual text input
- Grammar correction via transformers (Spanish) or LanguageTool API (English)
- Analyze text (known/unknown words) per user & language
- Save unknown words as known
- Generate coherent practice sentence (Spanish & English)
- Log grammar corrections and practice sentence suggestions to CSV
"""
import os
import datetime
import sqlite3
import requests
import random
import pandas as pd
import pdfplumber
import spacy
import csv
# SQLite is accessed via the built-in sqlite3 module (no need to install sqlite3-binary)
import sqlite3

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from transformers import AutoTokenizer, BartForConditionalGeneration, AutoModelForCausalLM
import gradio as gr
import gradio_client.utils as _gcu

# --- PATCH for Gradio utils schema bug ---
# Gradio's client utils can crash on schemas that are not plain dicts; wrap
# the original converters so any failure degrades to the permissive "any".
_orig_json = _gcu.json_schema_to_python_type
_orig_get = _gcu.get_type


def _patched_json_to_py(schema, defs=None):
    """Fault-tolerant wrapper around json_schema_to_python_type."""
    if isinstance(schema, dict):
        try:
            return _orig_json(schema, defs)
        except Exception:
            pass
    return "any"


def _patched_get_type(schema):
    """Fault-tolerant wrapper around get_type."""
    if isinstance(schema, dict):
        try:
            return _orig_get(schema)
        except Exception:
            pass
    return "any"


_gcu.json_schema_to_python_type = _patched_json_to_py
_gcu.get_type = _patched_get_type

# --- SQLite Database initialization ---
# One row per (user, language, word); the UNIQUE constraint lets later
# inserts use INSERT OR IGNORE, so re-saving a known word is a no-op.
DB_NAME = "vocabulary.db"

_init_conn = sqlite3.connect(DB_NAME)
_init_conn.execute(
    "CREATE TABLE IF NOT EXISTS vocabulary ("
    " user_id   TEXT,"
    " language  TEXT,"
    " word      TEXT,"
    " timestamp TEXT,"
    " UNIQUE(user_id, language, word)"
    ")"
)
_init_conn.commit()
_init_conn.close()

# --- Save word to database ---
def save_word_to_db(user_id: str, language: str, word: str) -> None:
    """Persist *word* as known for (user_id, language).

    Duplicates are ignored via the table's UNIQUE constraint
    (INSERT OR IGNORE). The connection is closed even if the
    INSERT raises, so repeated UI calls cannot leak handles.
    """
    ts = datetime.datetime.now().isoformat()
    conn = sqlite3.connect(DB_NAME)
    try:
        conn.execute(
            "INSERT OR IGNORE INTO vocabulary (user_id, language, word, timestamp) VALUES (?, ?, ?, ?)",
            (user_id, language, word, ts)
        )
        conn.commit()
    finally:
        conn.close()

# --- Retrieve known words for user/language ---
def get_user_vocabulary(user_id: str, language: str) -> set[str]:
    """Return the set of words the user has saved as known for *language*.

    The connection is closed even if the query raises (try/finally),
    fixing a handle leak on error in the original implementation.
    """
    conn = sqlite3.connect(DB_NAME)
    try:
        rows = conn.execute(
            "SELECT word FROM vocabulary WHERE user_id=? AND language=?",
            (user_id, language)
        ).fetchall()
    finally:
        conn.close()
    return {r[0] for r in rows}

# --- Load NLP models ---
# All models are loaded once at import time; startup is slow but each
# request afterwards reuses the in-memory models.
nlp = spacy.load("es_core_news_lg")  # Spanish pipeline; used for sentence splitting in correct_grammar
tokenizer = AutoTokenizer.from_pretrained("SkitCon/gec-spanish-BARTO-COWS-L2H")  # Spanish grammar-correction model
model     = BartForConditionalGeneration.from_pretrained("SkitCon/gec-spanish-BARTO-COWS-L2H")
gpt2_tokenizer_es = AutoTokenizer.from_pretrained("mrm8488/spanish-gpt2")  # Spanish sentence generation
gpt2_model_es     = AutoModelForCausalLM.from_pretrained("mrm8488/spanish-gpt2")
gpt2_tokenizer_en = AutoTokenizer.from_pretrained("gpt2")  # English sentence generation
gpt2_model_en     = AutoModelForCausalLM.from_pretrained("gpt2")
lemmatizer        = WordNetLemmatizer()  # NLTK lemmatizer used by analyze_text

# --- Log to CSV (grammar corrections and sentence suggestions) ---
def log_to_csv(filename, row, fieldnames):
    """Append *row* (a dict) to *filename*, writing a header first if needed.

    Args:
        filename: Path of the CSV log file; created on first use.
        row: Mapping of column name -> value, keys matching *fieldnames*.
        fieldnames: Column order for the header and every row.
    """
    # Write the header when the file is missing OR exists but is empty
    # (e.g. a previous run was interrupted right after creating it);
    # the original only checked for existence.
    needs_header = not os.path.isfile(filename) or os.path.getsize(filename) == 0
    with open(filename, "a", newline='', encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        if needs_header:
            writer.writeheader()
        writer.writerow(row)

# --- File Import ---
def import_file(path: str) -> str:
    """Read the text content of a .txt, .csv (column 'text') or .pdf file.

    Returns:
        The extracted text, pages/rows joined with newlines.
    Raises:
        ValueError: If the CSV lacks a 'text' column or the extension is
            unsupported (messages are in Swedish, matching the UI).
    """
    ext = os.path.splitext(path)[1].lower()
    if ext == ".pdf":
        pages = []
        with pdfplumber.open(path) as pdf:
            for p in pdf.pages:
                # extract_text() can return None for image-only pages
                pages.append(p.extract_text() or "")
        return "\n".join(pages)
    if ext == ".csv":
        df = pd.read_csv(path)
        if "text" in df:
            return "\n".join(df["text"].astype(str))
        raise ValueError("CSV saknar kolumnen 'text'.")
    if ext == ".txt":
        # Context manager so the handle is always closed (the original
        # used a bare open(...).read(), leaking the file object).
        with open(path, encoding="utf-8") as fh:
            return fh.read()
    raise ValueError(f"Okänt filformat: {ext}")
    
# --- Grammar Correction ---

def correct_grammar(text: str, language: str) -> str:
    """Return a grammar-corrected version of *text*.

    For Spanish ("es"), each spaCy-segmented sentence is passed through the
    BARTO GEC seq2seq model and the corrected sentences are joined with
    spaces. For any other language code, the text is sent to the public
    LanguageTool HTTP API and its first suggested replacement per match is
    applied (requires network access; no timeout or HTTP-error handling —
    a failed request raises).
    """
    if language == "es":
        corrected = []
        for sent in nlp(text).sents:
            s = sent.text.strip()
            if not s: continue  # skip empty sentence spans
            inp = tokenizer(s, return_tensors="pt", truncation=True, padding=True)
            out = model.generate(
                **inp,
                # cap generation at the input length in tokens
                max_new_tokens=inp.input_ids.shape[1],
                num_beams=5,
                early_stopping=True
            )
            corrected.append(tokenizer.decode(out[0], skip_special_tokens=True))
        return " ".join(corrected)
    # English: LanguageTool API
    resp = requests.post(
        "https://api.languagetool.org/v2/check",
        data={"text": text, "language": language}
    ).json()
    # Apply replacements from the end of the string backwards so the
    # offsets of earlier matches stay valid while text is being edited.
    for m in reversed(resp.get("matches", [])):
        off, ln = m["offset"], m["length"]
        repls = m.get("replacements", [])
        val = repls[0]["value"] if repls else ""  # no suggestion -> remove the span
        text = text[:off] + val + text[off+ln:]
    return text

# --- Analyze known and unknown words ---

def analyze_text(text: str, user_id: str, language: str):
    """Lemmatize the alphabetic tokens of *text* and partition them against
    the user's stored vocabulary.

    Returns a (known, unknown) pair of lemma lists; duplicates are kept in
    token order.
    """
    lemmas = [
        lemmatizer.lemmatize(token.lower())
        for token in word_tokenize(text)
        if token.isalpha()
    ]
    vocab = get_user_vocabulary(user_id, language)
    known = [lemma for lemma in lemmas if lemma in vocab]
    unknown = [lemma for lemma in lemmas if lemma not in vocab]
    return known, unknown
# --- Generate sentence using GPT2 based on unknown words ---
def generate_coherent_sentence(text: str, user_id: str, language: str, num_unknown=2) -> str:
    """Generate one practice sentence containing up to *num_unknown* of the
    user's unknown words from *text*, using the GPT-2 model for *language*.

    Sampling (do_sample=True) makes the output non-deterministic. Returns a
    Swedish message string when there are no unknown words or when the
    generated text contains no alphabetic characters.
    """
    kn, un = analyze_text(text, user_id, language)
    if not un:
        return "Inga okända ord att generera mening med."
    # Pick a random subset of the unknown words to build the prompt around
    chosen = random.sample(un, min(num_unknown, len(un)))
    if language == "es":
        prompt = "Escribe una sola frase clara que incluya estas palabras: " + ", ".join(chosen) + "."
        # NOTE: these locals shadow the module-level GEC tokenizer/model
        tokenizer = gpt2_tokenizer_es
        model     = gpt2_model_es
    else:
        prompt = "Write one clear sentence that includes the following words: " + ", ".join(chosen) + "."
        tokenizer = gpt2_tokenizer_en
        model     = gpt2_model_en
    inp = tokenizer(prompt, return_tensors="pt", truncation=True)
    outs = model.generate(
        **inp,
        max_new_tokens=50,
        do_sample=True,
        top_k=50,
        top_p=0.95
    )
    gen = tokenizer.decode(outs[0], skip_special_tokens=True)
    # Strip the echoed prompt, then keep only the first sentence of the rest
    body = gen[len(prompt):].strip() if gen.startswith(prompt) else gen.strip()
    sentence = (body.split(".")[0].strip() + ".") if "." in body else body
    if not any(c.isalpha() for c in sentence):
        return "Misslyckades att generera meningsfull övningsmening."
    return sentence

 
# --- Gradio process callback ---
def process(user, language, txt, file, do_grammar, do_save):
    """Main analysis callback for the "run" button.

    Chooses the manual text (preferred) or the uploaded file as input,
    optionally corrects grammar, partitions words into known/unknown,
    optionally saves unknown words to the DB, and logs the run to
    grammarlog.csv. Returns the five values expected by the output widgets:
    (corrected text, known words, unknown words, status, practice sentence).
    """
    try:
        text = (txt or "").strip()
        if not text:
            if not file:
                return "", "", "", "Ingen text angiven.", ""
            text = import_file(file.name)
        result = correct_grammar(text, language) if do_grammar else text
        known, unknown = analyze_text(result, user, language)
        status = ""
        if do_save and unknown:
            for word in unknown:
                save_word_to_db(user, language, word)
            status = f"Sparade {len(unknown)} ord."
        # Log the grammar correction to CSV
        log_to_csv(
            "grammarlog.csv",
            {
                "user": user, "language": language, "input": text,
                "output": result, "timestamp": datetime.datetime.now().isoformat()
            },
            ["user", "language", "input", "output", "timestamp"]
        )
        return result, ", ".join(known), ", ".join(unknown), status, ""
    except Exception:
        # Surface the full traceback in the status box instead of crashing the UI
        import traceback
        return "", "", "", f"FEL i process:\n{traceback.format_exc()}", ""

# --- Sentence generation callback ---
def coherent_fn(user, language, txt, num):
    """Callback for the practice-sentence button: generate a sentence from
    the user's unknown words and log the suggestion to sentencelog.csv.
    Returns an error message string instead of raising on failure."""
    try:
        sentence = generate_coherent_sentence(txt or "", user, language, num)
        # Log the practice-sentence suggestion to CSV
        log_to_csv(
            "sentencelog.csv",
            {
                "user": user, "language": language, "input": txt,
                "output": sentence, "timestamp": datetime.datetime.now().isoformat()
            },
            ["user", "language", "input", "output", "timestamp"]
        )
        return sentence
    except Exception as err:
        return f"Fel vid generering: {err}"

# --- Gradio UI ---
# Layout: user/language row, then an input column, then the five output
# boxes that the callbacks fill in order.
demo = gr.Blocks()
with demo:
    gr.Markdown("### 🌟 Språkinlärningsapp med användare & flerspråkighet")
    with gr.Row():
        # Vocabulary is stored per (user, language) pair
        user_input  = gr.Textbox(label="Användarnamn", placeholder="Ditt namn här")
        lang_dd     = gr.Dropdown(choices=["es", "en"], value="es", label="Språk")
    with gr.Column():
        manual_input = gr.Textbox(lines=4, label="Skriv/klistra in text")
        file_input   = gr.File(file_types=[".txt",".csv",".pdf"], label="Importera fil")
        grammar_cb   = gr.Checkbox(label="Grammatik­rättning")
        autosave_cb  = gr.Checkbox(label="Spara okända ord")
        run_btn      = gr.Button("Kör analys & korrigering")
        num_slider   = gr.Slider(minimum=1, maximum=5, step=1, value=2, label="Antal okända ord för övning")
        coherent_btn = gr.Button("Koherent övningsmening")

    # Output widgets; order matches the tuples returned by the callbacks
    corr_out    = gr.Textbox(label="Korrigerad text", lines=4)
    known_out   = gr.Textbox(label="Kända ord")
    unknown_out = gr.Textbox(label="Okända ord")
    status_out  = gr.Textbox(label="Status")
    coherent_out = gr.Textbox(label="Koherent övningsmening")

    # --- Button click bindings ---
    run_btn.click(
        fn=process,
        inputs=[user_input, lang_dd, manual_input, file_input, grammar_cb, autosave_cb],
        outputs=[corr_out, known_out, unknown_out, status_out, coherent_out]
    )
    coherent_btn.click(
        fn=coherent_fn,
        inputs=[user_input, lang_dd, manual_input, num_slider],
        outputs=[coherent_out]
    )
    # Make sure the selected language matches the language of the text being analyzed

# --- Start app ---
if __name__ == "__main__":
    # launch() returns (app, local_url, share_url), not a plain URL string.
    # With prevent_thread_lock=True the call returns immediately, so the
    # original script exited right after printing and killed the server;
    # block_thread() keeps it alive until interrupted.
    _, local_url, share_url = demo.launch(share=True, inbrowser=True, prevent_thread_lock=True)
    print("Appen körs på:", share_url or local_url)
    demo.block_thread()