JacobWP committed on
Commit
45baf3e
·
verified ·
1 Parent(s): 4ed530f

Delete app.py 2.txt

Browse files
Files changed (1) hide show
  1. app.py 2.txt +0 -337
app.py 2.txt DELETED
@@ -1,337 +0,0 @@
1
- #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
- """
4
- Created on Mon May 19 16:49:22 2025
5
-
6
- @author: jacobwildt-persson
7
- """
8
-
9
- #!/usr/bin/env python3
10
- # -*- coding: utf-8 -*-
11
- # -----------------------------------------------
12
- # Requirements & Setup Instructions
13
- # -----------------------------------------------
14
-
15
- # Python version:
16
- # Requires Python 3.10 or later (tested on 3.12)
17
-
18
-
19
- # Run your script inside a virtual environment (e.g. conda or venv) to avoid conflicts.
20
- # Recreate the environment with these commands in terminal
21
- # conda env create -f environment.yml
22
- # conda activate sprakenv
23
- #
24
-
25
- # Install all required packages:
26
- # Run these commands in the terminal:
27
-
28
- # pip install --upgrade gradio
29
- # pip install pdfplumber
30
- # pip install nltk
31
- # pip install transformers
32
- # pip install -U spacy
33
-
34
- # Download language models:
35
- # python -m spacy download es_core_news_lg
36
- # python -m spacy download en_core_web_lg # if you add NER for English
37
-
38
- # Check Gradio version used:
39
- # import gradio as gr
40
- # print(gr.__version__) # Gradio version 4.18.0
41
-
42
- # 🔗 Reference: Gradio Quickstart Guide
43
- # https://www.gradio.app/guides/quickstart
44
- #Hugging Face
45
- # https://huggingface.co/models
46
-
47
- # English API model
48
- # LanguageTool API: https://languagetool.org/http-api/swagger
49
-
50
-
51
-
52
- # Remember !!!!!!!!!!!!!!!!!!!!!!!!!
53
- # Run your script inside a virtual environment (e.g. conda or venv) to avoid conflicts.
54
- # Recreate the environment with these commands in terminal
55
- # conda env create -f environment.yml
56
- # conda activate sprakenv
57
- # python -m spacy download es_core_news_lg
58
- #python -m nltk.downloader punkt wordnet
59
- # -----------------------------------------------
60
- """
61
- Language learning app with Gradio UI, on & multiple users:
62
- - Import text from file (.txt/.csv/.pdf) or manual text input
63
- - Grammar correction via transformers (Spanish) or LanguageTool API (English)
64
- - Analyze text (known/unknown words) per user & language
65
- - Save unknown words as known
66
- - Generate coherent practice sentence (Spanish & English)
67
- - Log grammar corrections and practice sentence suggestions to CSV
68
- """
69
- import os
70
- import datetime
71
- import sqlite3
72
- import requests
73
- import random
74
- import pandas as pd
75
- import pdfplumber
76
- import spacy
77
- import csv
78
- # SQLite is accessed via the built-in sqlite3 module (no need to install sqlite3-binary)
79
- import sqlite3
80
-
81
- from nltk.tokenize import word_tokenize
82
- from nltk.stem import WordNetLemmatizer
83
- from transformers import AutoTokenizer, BartForConditionalGeneration, AutoModelForCausalLM
84
- import gradio as gr
85
- import gradio_client.utils as _gcu
86
-
87
# --- PATCH for Gradio utils schema bug ---
# Wrap gradio_client's schema helpers so malformed or non-dict schemas
# degrade to "any" instead of raising inside Gradio's API-info generation.
_orig_json = _gcu.json_schema_to_python_type
_orig_get = _gcu.get_type

def _patched_json_to_py(schema, defs=None):
    """Fault-tolerant replacement for json_schema_to_python_type."""
    if isinstance(schema, dict):
        try:
            return _orig_json(schema, defs)
        except Exception:
            pass
    # Non-dict schema or a conversion failure: fall back to "any".
    return "any"

def _patched_get_type(schema):
    """Fault-tolerant replacement for get_type."""
    if isinstance(schema, dict):
        try:
            return _orig_get(schema)
        except Exception:
            pass
    return "any"

# Install the wrappers in place of the originals.
_gcu.json_schema_to_python_type = _patched_json_to_py
_gcu.get_type = _patched_get_type
-
110
# --- SQLite database initialization ---
# Creates the vocabulary table once at import time.
DB_NAME = "vocabulary.db"

conn = sqlite3.connect(DB_NAME)
try:
    # UNIQUE(user_id, language, word) lets INSERT OR IGNORE deduplicate
    # when words are saved repeatedly for the same user/language.
    conn.execute("""
    CREATE TABLE IF NOT EXISTS vocabulary (
        user_id TEXT,
        language TEXT,
        word TEXT,
        timestamp TEXT,
        UNIQUE(user_id, language, word)
    )
    """)
    conn.commit()
finally:
    # Close the handle even if table creation fails, so it never leaks.
    conn.close()
124
-
125
- # --- Save word to database ---
126
- def save_word_to_db(user_id: str, language: str, word: str):
127
- ts = datetime.datetime.now().isoformat()
128
- conn = sqlite3.connect(DB_NAME)
129
- conn.execute(
130
- "INSERT OR IGNORE INTO vocabulary (user_id, language, word, timestamp) VALUES (?, ?, ?, ?)",
131
- (user_id, language, word, ts)
132
- )
133
- conn.commit()
134
- conn.close()
135
-
136
- # --- Retrieve known words for user/language ---
137
- def get_user_vocabulary(user_id: str, language: str) -> set[str]:
138
- conn = sqlite3.connect(DB_NAME)
139
- rows = conn.execute(
140
- "SELECT word FROM vocabulary WHERE user_id=? AND language=?",
141
- (user_id, language)
142
- ).fetchall()
143
- conn.close()
144
- return {r[0] for r in rows}
145
-
146
# --- Load NLP models (all loaded eagerly at import time) ---
# Spanish pipeline for sentence segmentation in correct_grammar().
nlp = spacy.load("es_core_news_lg")
# Seq2seq grammar-correction model for Spanish (BARTO fine-tuned on COWS-L2H).
tokenizer = AutoTokenizer.from_pretrained("SkitCon/gec-spanish-BARTO-COWS-L2H")
model = BartForConditionalGeneration.from_pretrained("SkitCon/gec-spanish-BARTO-COWS-L2H")
# Causal LMs used by generate_coherent_sentence() for practice sentences.
gpt2_tokenizer_es = AutoTokenizer.from_pretrained("mrm8488/spanish-gpt2")
gpt2_model_es = AutoModelForCausalLM.from_pretrained("mrm8488/spanish-gpt2")
gpt2_tokenizer_en = AutoTokenizer.from_pretrained("gpt2")
gpt2_model_en = AutoModelForCausalLM.from_pretrained("gpt2")
# English lemmatizer used by analyze_text(); requires NLTK wordnet data.
# NOTE(review): WordNetLemmatizer is English-only — Spanish words pass
# through mostly unchanged; confirm this is the intended behavior.
lemmatizer = WordNetLemmatizer()
155
-
156
# --- Log to CSV (grammar corrections and sentence suggestions) ---
def log_to_csv(filename, row, fieldnames):
    """Append one dict row to a CSV log, writing the header on first creation.

    Args:
        filename: path of the CSV log file.
        row: mapping of field name -> value for one log entry.
        fieldnames: column order for the CSV header and rows.
    """
    # Header is only needed when the log file does not exist yet.
    needs_header = not os.path.isfile(filename)
    with open(filename, "a", newline='', encoding="utf-8") as handle:
        writer = csv.DictWriter(handle, fieldnames=fieldnames)
        if needs_header:
            writer.writeheader()
        writer.writerow(row)
164
-
165
# --- File Import ---
def import_file(path: str) -> str:
    """Read text content from a .txt, .csv (column 'text'), or .pdf file.

    Args:
        path: path of the file to import; dispatch is by extension.

    Returns:
        The extracted text, pages/rows joined with newlines.

    Raises:
        ValueError: unsupported extension, or CSV without a 'text' column.
    """
    ext = os.path.splitext(path)[1].lower()
    if ext == ".pdf":
        # extract_text() may return None for image-only pages; keep "" instead.
        with pdfplumber.open(path) as pdf:
            return "\n".join(page.extract_text() or "" for page in pdf.pages)
    if ext == ".csv":
        df = pd.read_csv(path)
        if "text" in df:
            return "\n".join(df["text"].astype(str))
        raise ValueError("CSV saknar kolumnen 'text'.")
    if ext == ".txt":
        # Context manager fixes the original's leaked file handle.
        with open(path, encoding="utf-8") as f:
            return f.read()
    raise ValueError(f"Okänt filformat: {ext}")
182
-
183
# --- Grammar Correction ---

def correct_grammar(text: str, language: str) -> str:
    """Correct grammar in `text`.

    Spanish ("es") runs the local BARTO seq2seq model sentence by sentence;
    any other language code is sent to the public LanguageTool HTTP API.

    Args:
        text: raw input text.
        language: language code, e.g. "es" or "en".

    Returns:
        The corrected text.
    """
    if language == "es":
        corrected = []
        # spaCy sentence segmentation; each sentence is corrected separately.
        for sent in nlp(text).sents:
            s = sent.text.strip()
            if not s: continue
            inp = tokenizer(s, return_tensors="pt", truncation=True, padding=True)
            out = model.generate(
                **inp,
                # Cap generation at the input length (in tokens).
                max_new_tokens=inp.input_ids.shape[1],
                num_beams=5,
                early_stopping=True
            )
            corrected.append(tokenizer.decode(out[0], skip_special_tokens=True))
        return " ".join(corrected)
    # English: LanguageTool API
    # NOTE(review): no timeout= — a slow or hung API call blocks this request.
    resp = requests.post(
        "https://api.languagetool.org/v2/check",
        data={"text": text, "language": language}
    ).json()
    # Apply suggestions from the last match backwards so edits do not shift
    # the offsets of matches earlier in the text (assumes the API returns
    # matches in ascending offset order).
    for m in reversed(resp.get("matches", [])):
        off, ln = m["offset"], m["length"]
        repls = m.get("replacements", [])
        # Take the top-ranked replacement; delete the span if none offered.
        val = repls[0]["value"] if repls else ""
        text = text[:off] + val + text[off+ln:]
    return text
211
-
212
# --- Analyze known and unknown words ---

def analyze_text(text: str, user_id: str, language: str):
    """Split `text` into lemmas the user already knows vs. ones they do not.

    Args:
        text: input text to tokenize and lemmatize.
        user_id: vocabulary owner.
        language: language code used to look up the stored vocabulary.

    Returns:
        (known, unknown): two lists of lowercase lemmas, in text order.
    """
    # Tokenize, keep alphabetic tokens only, lowercase, then lemmatize.
    lemmas = [
        lemmatizer.lemmatize(token.lower())
        for token in word_tokenize(text)
        if token.isalpha()
    ]
    vocab = get_user_vocabulary(user_id, language)
    known = [lemma for lemma in lemmas if lemma in vocab]
    unknown = [lemma for lemma in lemmas if lemma not in vocab]
    return known, unknown
221
# --- Generate sentence using GPT2 based on unknown words ---
def generate_coherent_sentence(text: str, user_id: str, language: str, num_unknown=2) -> str:
    """Generate one practice sentence containing some of the user's unknown words.

    Args:
        text: source text mined for unknown words.
        user_id: vocabulary owner.
        language: "es" uses the Spanish GPT-2; anything else uses English GPT-2.
        num_unknown: maximum number of unknown words to put in the prompt.

    Returns:
        A generated sentence, or a Swedish status message when there are no
        unknown words or generation produced no alphabetic content.
    """
    kn, un = analyze_text(text, user_id, language)
    if not un:
        return "Inga okända ord att generera mening med."
    # Pick up to num_unknown random unknown words for the prompt.
    chosen = random.sample(un, min(num_unknown, len(un)))
    if language == "es":
        prompt = "Escribe una sola frase clara que incluya estas palabras: " + ", ".join(chosen) + "."
        # NOTE: these locals deliberately shadow the module-level
        # `tokenizer`/`model` (the grammar-correction pair).
        tokenizer = gpt2_tokenizer_es
        model = gpt2_model_es
    else:
        prompt = "Write one clear sentence that includes the following words: " + ", ".join(chosen) + "."
        tokenizer = gpt2_tokenizer_en
        model = gpt2_model_en
    inp = tokenizer(prompt, return_tensors="pt", truncation=True)
    # Sampled decoding — output is intentionally non-deterministic.
    outs = model.generate(
        **inp,
        max_new_tokens=50,
        do_sample=True,
        top_k=50,
        top_p=0.95
    )
    gen = tokenizer.decode(outs[0], skip_special_tokens=True)
    # GPT-2 usually echoes the prompt; keep only the continuation.
    body = gen[len(prompt):].strip() if gen.startswith(prompt) else gen.strip()
    # Keep only the first sentence of the continuation.
    sentence = (body.split(".")[0].strip() + ".") if "." in body else body
    if not any(c.isalpha() for c in sentence):
        return "Misslyckades att generera meningsfull övningsmening."
    return sentence
249
-
250
-
251
# --- Gradio process callback ---
def process(user, language, txt, file, do_grammar, do_save):
    """Gradio callback: import, optionally correct, and analyze a text.

    Manual text takes precedence over an uploaded file. Unknown words can be
    persisted to the vocabulary database, and every run is logged to CSV.

    Args:
        user: user name identifying the vocabulary.
        language: "es" or "en".
        txt: manually entered text (preferred over `file` when non-empty).
        file: uploaded file object exposing a .name path, or None.
        do_grammar: run grammar correction when True.
        do_save: store unknown words in the database when True.

    Returns:
        (corrected_text, known_csv, unknown_csv, status, "") — the trailing
        empty string clears the practice-sentence output box.
    """
    try:
        if txt and txt.strip():
            text = txt.strip()
        elif file:
            text = import_file(file.name)
        else:
            return "", "", "", "Ingen text angiven.", ""
        out = correct_grammar(text, language) if do_grammar else text
        kn, un = analyze_text(out, user, language)
        status = ""
        if do_save and un:
            for w in un:
                save_word_to_db(user, language, w)
            status = f"Sparade {len(un)} ord."
        # Log the grammar correction to CSV.
        log_to_csv(
            "grammarlog.csv",
            {
                "user": user, "language": language, "input": text,
                "output": out, "timestamp": datetime.datetime.now().isoformat()
            },
            ["user", "language", "input", "output", "timestamp"]
        )
        return out, ", ".join(kn), ", ".join(un), status, ""
    except Exception:
        # Surface the full traceback in the status box instead of crashing
        # the UI. (The original bound the exception to an unused `e`.)
        import traceback
        tb = traceback.format_exc()
        return "", "", "", f"FEL i process:\n{tb}", ""
281
-
282
# --- Sentence generation callback ---
def coherent_fn(user, language, txt, num):
    """Gradio callback: generate a practice sentence and log it to CSV.

    Args:
        user: user name identifying the vocabulary.
        language: "es" or "en".
        txt: source text to mine for unknown words (may be empty/None).
        num: number of unknown words to include.

    Returns:
        The generated sentence, or a Swedish error message on failure.
    """
    try:
        suggestion = generate_coherent_sentence(txt or "", user, language, num)
        # Log the practice-sentence suggestion to CSV.
        record = {
            "user": user,
            "language": language,
            "input": txt,
            "output": suggestion,
            "timestamp": datetime.datetime.now().isoformat(),
        }
        columns = ["user", "language", "input", "output", "timestamp"]
        log_to_csv("sentencelog.csv", record, columns)
        return suggestion
    except Exception as e:
        return f"Fel vid generering: {e}"
298
-
299
# --- Gradio UI ---
# Declarative Blocks layout; all labels are user-facing Swedish strings.
demo = gr.Blocks()
with demo:
    gr.Markdown("### 🌟 Språkinlärningsapp med användare & flerspråkighet")
    with gr.Row():
        # User name + target-language selector drive the vocabulary lookup.
        user_input = gr.Textbox(label="Användarnamn", placeholder="Ditt namn här")
        lang_dd = gr.Dropdown(choices=["es", "en"], value="es", label="Språk")
    with gr.Column():
        # Text input: manual paste or file upload (.txt/.csv/.pdf).
        manual_input = gr.Textbox(lines=4, label="Skriv/klistra in text")
        file_input = gr.File(file_types=[".txt",".csv",".pdf"], label="Importera fil")
        grammar_cb = gr.Checkbox(label="Grammatik­rättning")
        autosave_cb = gr.Checkbox(label="Spara okända ord")
        run_btn = gr.Button("Kör analys & korrigering")
        num_slider = gr.Slider(minimum=1, maximum=5, step=1, value=2, label="Antal okända ord för övning")
        coherent_btn = gr.Button("Koherent övningsmening")

    # Output widgets filled by the two callbacks below.
    corr_out = gr.Textbox(label="Korrigerad text", lines=4)
    known_out = gr.Textbox(label="Kända ord")
    unknown_out = gr.Textbox(label="Okända ord")
    status_out = gr.Textbox(label="Status")
    coherent_out = gr.Textbox(label="Koherent övningsmening")

    # --- Button click wiring ---
    run_btn.click(
        fn=process,
        inputs=[user_input, lang_dd, manual_input, file_input, grammar_cb, autosave_cb],
        outputs=[corr_out, known_out, unknown_out, status_out, coherent_out]
    )
    coherent_btn.click(
        fn=coherent_fn,
        inputs=[user_input, lang_dd, manual_input, num_slider],
        outputs=[coherent_out]
    )
# Make sure the language selector matches the target language of the analyzed text.
333
-
334
# --- Start app ---
if __name__ == "__main__":
    # prevent_thread_lock=True makes launch() return immediately instead of
    # blocking; share=True requests a public tunnel URL.
    # NOTE(review): depending on the Gradio version, launch() may return an
    # app/URL tuple rather than a plain URL string — confirm before relying
    # on the printed value.
    url = demo.launch(share=True, inbrowser=True, prevent_thread_lock=True)
    print("Appen körs på:", url)