Spaces:
Runtime error
Runtime error
Delete app.py 2.txt
Browse files- app.py 2.txt +0 -337
app.py 2.txt
DELETED
@@ -1,337 +0,0 @@
|
|
1 |
-
#!/usr/bin/env python3
|
2 |
-
# -*- coding: utf-8 -*-
|
3 |
-
"""
|
4 |
-
Created on Mon May 19 16:49:22 2025
|
5 |
-
|
6 |
-
@author: jacobwildt-persson
|
7 |
-
"""
|
8 |
-
|
9 |
-
#!/usr/bin/env python3
|
10 |
-
# -*- coding: utf-8 -*-
|
11 |
-
# -----------------------------------------------
|
12 |
-
# Requirements & Setup Instructions
|
13 |
-
# -----------------------------------------------
|
14 |
-
|
15 |
-
# Python version:
|
16 |
-
# Requires Python 3.10 or later (tested on 3.12)
|
17 |
-
|
18 |
-
|
19 |
-
# Run your script inside a virtual environment (e.g. conda or venv) to avoid conflicts.
|
20 |
-
# Recreate the environment with these commands in the terminal
|
21 |
-
# conda env create -f environment.yml
|
22 |
-
# conda activate sprakenv
|
23 |
-
#
|
24 |
-
|
25 |
-
# Install all required packages:
|
26 |
-
# Run these commands in the terminal:
|
27 |
-
|
28 |
-
# pip install --upgrade gradio
|
29 |
-
# pip install pdfplumber
|
30 |
-
# pip install nltk
|
31 |
-
# pip install transformers
|
32 |
-
# pip install -U spacy
|
33 |
-
|
34 |
-
# Download language models:
|
35 |
-
# python -m spacy download es_core_news_lg
|
36 |
-
# python -m spacy download en_core_web_lg # if you add NER for English
|
37 |
-
|
38 |
-
# Check Gradio version used:
|
39 |
-
# import gradio as gr
|
40 |
-
# print(gr.__version__) # Gradio version 4.18.0
|
41 |
-
|
42 |
-
# 🔗 Reference: Gradio Quickstart Guide
|
43 |
-
# https://www.gradio.app/guides/quickstart
|
44 |
-
#Hugging Face
|
45 |
-
# https://huggingface.co/models
|
46 |
-
|
47 |
-
# English API model
|
48 |
-
# LanguageTool API: https://languagetool.org/http-api/swagger
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
# REMEMBER:
|
53 |
-
# Run your script inside a virtual environment (e.g. conda or venv) to avoid conflicts.
|
54 |
-
# Recreate the environment with these commands in the terminal
|
55 |
-
# conda env create -f environment.yml
|
56 |
-
# conda activate sprakenv
|
57 |
-
# python -m spacy download es_core_news_lg
|
58 |
-
#python -m nltk.downloader punkt wordnet
|
59 |
-
# -----------------------------------------------
|
60 |
-
"""
|
61 |
-
Language learning app with Gradio UI, supporting multiple users:
|
62 |
-
- Import text from file (.txt/.csv/.pdf) or manual text input
|
63 |
-
- Grammar correction via transformers (Spanish) or LanguageTool API (English)
|
64 |
-
- Analyze text (known/unknown words) per user & language
|
65 |
-
- Save unknown words as known
|
66 |
-
- Generate coherent practice sentence (Spanish & English)
|
67 |
-
- Log grammar corrections and practice sentence suggestions to CSV
|
68 |
-
"""
|
69 |
-
import os
|
70 |
-
import datetime
|
71 |
-
import sqlite3
|
72 |
-
import requests
|
73 |
-
import random
|
74 |
-
import pandas as pd
|
75 |
-
import pdfplumber
|
76 |
-
import spacy
|
77 |
-
import csv
|
78 |
-
# SQLite is accessed via the built-in sqlite3 module (no need to install sqlite3-binary)
|
79 |
-
import sqlite3
|
80 |
-
|
81 |
-
from nltk.tokenize import word_tokenize
|
82 |
-
from nltk.stem import WordNetLemmatizer
|
83 |
-
from transformers import AutoTokenizer, BartForConditionalGeneration, AutoModelForCausalLM
|
84 |
-
import gradio as gr
|
85 |
-
import gradio_client.utils as _gcu
|
86 |
-
|
87 |
-
# --- PATCH for Gradio utils schema bug ---
# Some Gradio versions crash inside json_schema_to_python_type / get_type when
# handed a non-dict schema (e.g. a bare bool). Wrap both so they degrade to
# the permissive type string "any" instead of raising.
_orig_json = _gcu.json_schema_to_python_type
_orig_get = _gcu.get_type


def _patched_json_to_py(schema, defs=None):
    """Defensive wrapper: return "any" for non-dict schemas or on any error."""
    if not isinstance(schema, dict):
        return "any"
    try:
        return _orig_json(schema, defs)
    except Exception:
        return "any"


def _patched_get_type(schema):
    """Defensive wrapper around the original get_type; mirrors _patched_json_to_py."""
    if not isinstance(schema, dict):
        return "any"
    try:
        return _orig_get(schema)
    except Exception:
        return "any"


# Install the wrappers in place of the originals.
_gcu.json_schema_to_python_type = _patched_json_to_py
_gcu.get_type = _patched_get_type
|
109 |
-
|
110 |
-
# --- SQLite database initialization ---
DB_NAME = "vocabulary.db"


def _init_db() -> None:
    """Create the vocabulary table in DB_NAME if it does not exist (idempotent)."""
    conn = sqlite3.connect(DB_NAME)
    try:
        conn.execute("""
            CREATE TABLE IF NOT EXISTS vocabulary (
                user_id TEXT,
                language TEXT,
                word TEXT,
                timestamp TEXT,
                UNIQUE(user_id, language, word)
            )
        """)
        conn.commit()
    finally:
        # Fix: the original leaked the connection if CREATE TABLE raised.
        conn.close()


_init_db()
|
124 |
-
|
125 |
-
# --- Save word to database ---
|
126 |
-
def save_word_to_db(user_id: str, language: str, word: str):
|
127 |
-
ts = datetime.datetime.now().isoformat()
|
128 |
-
conn = sqlite3.connect(DB_NAME)
|
129 |
-
conn.execute(
|
130 |
-
"INSERT OR IGNORE INTO vocabulary (user_id, language, word, timestamp) VALUES (?, ?, ?, ?)",
|
131 |
-
(user_id, language, word, ts)
|
132 |
-
)
|
133 |
-
conn.commit()
|
134 |
-
conn.close()
|
135 |
-
|
136 |
-
# --- Retrieve known words for user/language ---
|
137 |
-
def get_user_vocabulary(user_id: str, language: str) -> set[str]:
|
138 |
-
conn = sqlite3.connect(DB_NAME)
|
139 |
-
rows = conn.execute(
|
140 |
-
"SELECT word FROM vocabulary WHERE user_id=? AND language=?",
|
141 |
-
(user_id, language)
|
142 |
-
).fetchall()
|
143 |
-
conn.close()
|
144 |
-
return {r[0] for r in rows}
|
145 |
-
|
146 |
-
# --- Load NLP models (heavy: all loaded once at import time) ---
_GEC_MODEL = "SkitCon/gec-spanish-BARTO-COWS-L2H"

# spaCy pipeline used for Spanish sentence segmentation in correct_grammar.
nlp = spacy.load("es_core_news_lg")
# Seq2seq grammar-correction model for Spanish.
tokenizer = AutoTokenizer.from_pretrained(_GEC_MODEL)
model = BartForConditionalGeneration.from_pretrained(_GEC_MODEL)
# GPT-2 language models for practice-sentence generation (Spanish / English).
gpt2_tokenizer_es = AutoTokenizer.from_pretrained("mrm8488/spanish-gpt2")
gpt2_model_es = AutoModelForCausalLM.from_pretrained("mrm8488/spanish-gpt2")
gpt2_tokenizer_en = AutoTokenizer.from_pretrained("gpt2")
gpt2_model_en = AutoModelForCausalLM.from_pretrained("gpt2")
# NOTE(review): WordNetLemmatizer is English-only; Spanish tokens pass through
# mostly unchanged in analyze_text — confirm this is intended.
lemmatizer = WordNetLemmatizer()
|
155 |
-
|
156 |
-
# --- Log to CSV (grammar corrections and sentence suggestions) ---
def log_to_csv(filename, row, fieldnames):
    """Append *row* (a dict) to *filename* as CSV.

    Writes the header line only when the file does not yet exist.
    """
    needs_header = not os.path.isfile(filename)
    with open(filename, "a", newline='', encoding="utf-8") as handle:
        writer = csv.DictWriter(handle, fieldnames=fieldnames)
        if needs_header:
            writer.writeheader()
        writer.writerow(row)
|
164 |
-
|
165 |
-
# --- File Import ---
def import_file(path: str) -> str:
    """Extract plain text from a .pdf, .csv, or .txt file.

    - .pdf: concatenates text of all pages (blank string for image-only pages)
    - .csv: requires a 'text' column; joins its values line by line
    - .txt: returns the UTF-8 file contents

    Raises ValueError for an unknown extension or a CSV without 'text'.
    """
    ext = os.path.splitext(path)[1].lower()
    if ext == ".pdf":
        with pdfplumber.open(path) as pdf:
            return "\n".join(page.extract_text() or "" for page in pdf.pages)
    if ext == ".csv":
        df = pd.read_csv(path)
        if "text" in df:
            return "\n".join(df["text"].astype(str))
        raise ValueError("CSV saknar kolumnen 'text'.")
    if ext == ".txt":
        # Fix: the original used open(path).read() without closing the handle.
        with open(path, encoding="utf-8") as fh:
            return fh.read()
    raise ValueError(f"Okänt filformat: {ext}")
|
182 |
-
|
183 |
-
# --- Grammar Correction ---

def correct_grammar(text: str, language: str) -> str:
    """Return a grammar-corrected version of *text*.

    Spanish ("es"): each sentence is corrected by the local BARTO seq2seq
    model. Any other language code is sent to the public LanguageTool HTTP
    API and its first suggested replacement per match is applied.
    """
    if language == "es":
        corrected = []
        for sent in nlp(text).sents:
            s = sent.text.strip()
            if not s:
                continue
            inp = tokenizer(s, return_tensors="pt", truncation=True, padding=True)
            out = model.generate(
                **inp,
                max_new_tokens=inp.input_ids.shape[1],
                num_beams=5,
                early_stopping=True,
            )
            corrected.append(tokenizer.decode(out[0], skip_special_tokens=True))
        return " ".join(corrected)
    # English (and other codes): LanguageTool API.
    # Fix: added a timeout so a hung API call cannot freeze the UI forever.
    resp = requests.post(
        "https://api.languagetool.org/v2/check",
        data={"text": text, "language": language},
        timeout=30,
    ).json()
    # Apply replacements from the end of the text so earlier offsets stay valid.
    for m in reversed(resp.get("matches", [])):
        off, ln = m["offset"], m["length"]
        repls = m.get("replacements", [])
        val = repls[0]["value"] if repls else ""
        text = text[:off] + val + text[off + ln:]
    return text
|
211 |
-
|
212 |
-
# --- Analyze known and unknown words ---

def analyze_text(text: str, user_id: str, language: str):
    """Tokenize and lemmatize *text*, then partition the lemmas into
    (known, unknown) lists relative to the user's stored vocabulary.

    Order and duplicates of the input lemmas are preserved in both lists.
    """
    lemmas = [
        lemmatizer.lemmatize(token.lower())
        for token in word_tokenize(text)
        if token.isalpha()
    ]
    vocab = get_user_vocabulary(user_id, language)
    known, unknown = [], []
    for lemma in lemmas:
        (known if lemma in vocab else unknown).append(lemma)
    return known, unknown
|
221 |
-
# --- Generate sentence using GPT-2 based on unknown words ---
def generate_coherent_sentence(text: str, user_id: str, language: str, num_unknown=2) -> str:
    """Generate one practice sentence containing up to *num_unknown* of the
    user's unknown words from *text*, using the Spanish or English GPT-2.

    Returns a Swedish status message when there are no unknown words or the
    model output contains no letters.
    """
    _, unknown = analyze_text(text, user_id, language)
    if not unknown:
        return "Inga okända ord att generera mening med."
    picked = random.sample(unknown, min(num_unknown, len(unknown)))
    if language == "es":
        prompt = "Escribe una sola frase clara que incluya estas palabras: " + ", ".join(picked) + "."
        tok, lm = gpt2_tokenizer_es, gpt2_model_es
    else:
        prompt = "Write one clear sentence that includes the following words: " + ", ".join(picked) + "."
        tok, lm = gpt2_tokenizer_en, gpt2_model_en
    encoded = tok(prompt, return_tensors="pt", truncation=True)
    generated = lm.generate(
        **encoded,
        max_new_tokens=50,
        do_sample=True,
        top_k=50,
        top_p=0.95,
    )
    decoded = tok.decode(generated[0], skip_special_tokens=True)
    # Strip the prompt echo, then keep only the first sentence.
    body = decoded[len(prompt):].strip() if decoded.startswith(prompt) else decoded.strip()
    sentence = (body.split(".")[0].strip() + ".") if "." in body else body
    if not any(ch.isalpha() for ch in sentence):
        return "Misslyckades att generera meningsfull övningsmening."
    return sentence
|
249 |
-
|
250 |
-
|
251 |
-
# --- Gradio process callback ---
def process(user, language, txt, file, do_grammar, do_save):
    """Main analysis callback for the Gradio UI.

    Resolves the input text (manual text takes priority over an uploaded
    file), optionally applies grammar correction, partitions words into
    known/unknown for *user*/*language*, optionally saves the unknown words,
    and logs the run to grammarlog.csv.

    Returns a 5-tuple (corrected_text, known_csv, unknown_csv, status, "") —
    the trailing empty string clears the practice-sentence output box.
    """
    try:
        if txt and txt.strip():
            text = txt.strip()
        elif file:
            text = import_file(file.name)
        else:
            return "", "", "", "Ingen text angiven.", ""
        out = correct_grammar(text, language) if do_grammar else text
        kn, un = analyze_text(out, user, language)
        status = ""
        if do_save and un:
            for w in un:
                save_word_to_db(user, language, w)
            status = f"Sparade {len(un)} ord."
        # Log the grammar correction to CSV.
        log_to_csv(
            "grammarlog.csv",
            {
                "user": user, "language": language, "input": text,
                "output": out, "timestamp": datetime.datetime.now().isoformat()
            },
            ["user", "language", "input", "output", "timestamp"]
        )
        return out, ", ".join(kn), ", ".join(un), status, ""
    except Exception:
        # Fix: dropped the unused `as e` binding; the traceback is what is shown.
        # Surface the full traceback in the status box instead of crashing the UI.
        import traceback
        tb = traceback.format_exc()
        return "", "", "", f"FEL i process:\n{tb}", ""
|
281 |
-
|
282 |
-
# --- Sentence generation callback ---
def coherent_fn(user, language, txt, num):
    """Gradio callback: generate a practice sentence, log it, and return it.

    Any failure (generation or logging) is reported as a Swedish error string
    instead of raising into the UI.
    """
    try:
        sentence = generate_coherent_sentence(txt or "", user, language, num)
        # Log the practice-sentence suggestion to CSV.
        fields = ["user", "language", "input", "output", "timestamp"]
        log_to_csv(
            "sentencelog.csv",
            {
                "user": user,
                "language": language,
                "input": txt,
                "output": sentence,
                "timestamp": datetime.datetime.now().isoformat(),
            },
            fields,
        )
        return sentence
    except Exception as e:
        return f"Fel vid generering: {e}"
|
298 |
-
|
299 |
-
# --- Gradio UI ---
with gr.Blocks() as demo:
    gr.Markdown("### 🌟 Språkinlärningsapp med användare & flerspråkighet")
    # User identity and target language.
    with gr.Row():
        user_input = gr.Textbox(label="Användarnamn", placeholder="Ditt namn här")
        lang_dd = gr.Dropdown(choices=["es", "en"], value="es", label="Språk")
    # Text input, options, and action buttons.
    with gr.Column():
        manual_input = gr.Textbox(lines=4, label="Skriv/klistra in text")
        file_input = gr.File(file_types=[".txt", ".csv", ".pdf"], label="Importera fil")
        grammar_cb = gr.Checkbox(label="Grammatikrättning")
        autosave_cb = gr.Checkbox(label="Spara okända ord")
        run_btn = gr.Button("Kör analys & korrigering")
        num_slider = gr.Slider(minimum=1, maximum=5, step=1, value=2, label="Antal okända ord för övning")
        coherent_btn = gr.Button("Koherent övningsmening")

    # Output widgets.
    corr_out = gr.Textbox(label="Korrigerad text", lines=4)
    known_out = gr.Textbox(label="Kända ord")
    unknown_out = gr.Textbox(label="Okända ord")
    status_out = gr.Textbox(label="Status")
    coherent_out = gr.Textbox(label="Koherent övningsmening")

    # Wire the buttons to their callbacks.
    run_btn.click(
        fn=process,
        inputs=[user_input, lang_dd, manual_input, file_input, grammar_cb, autosave_cb],
        outputs=[corr_out, known_out, unknown_out, status_out, coherent_out],
    )
    coherent_btn.click(
        fn=coherent_fn,
        inputs=[user_input, lang_dd, manual_input, num_slider],
        outputs=[coherent_out],
    )
# NOTE: set the language dropdown to match the language of the text being analyzed.
|
333 |
-
|
334 |
-
# --- Start app ---
if __name__ == "__main__":
    # prevent_thread_lock=True makes launch() return immediately; without a
    # subsequent block the script would exit and take the server down with it.
    url = demo.launch(share=True, inbrowser=True, prevent_thread_lock=True)
    print("Appen körs på:", url)
    # Fix: keep the process alive while the app is being served.
    demo.block_thread()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|