Update app.py
app.py CHANGED
@@ -29,6 +29,23 @@ from tokenizers import Tokenizer
 from tokenizers.models import BPE
 from tokenizers.trainers import BpeTrainer
 
+
+
+def download_nltk_resources():
+    resources = [
+        'punkt',
+        'stopwords',
+        'snowball_data',
+    ]
+    for resource in resources:
+        try:
+            nltk.download(resource, quiet=True)
+        except Exception as e:
+            print(f"Failed to download {resource}: {str(e)}")
+
+download_nltk_resources()
+
+
 nltk.download('stopwords', quiet=True)
 nltk.download('punkt', quiet=True)
 
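Note on the hunk above: nltk.download() typically reports failure through its return value (False) rather than by raising, so the except branch mainly catches environment-level errors. A minimal sketch of a stricter post-download check, assuming the standard NLTK resource paths; verify_nltk_resources is a hypothetical helper, not part of this commit:

import nltk

def verify_nltk_resources():
    # Map resource names to the paths nltk.data.find() expects.
    # nltk.data.find() raises LookupError when a resource is absent,
    # unlike nltk.download(), which returns False on failure.
    checks = {
        'punkt': 'tokenizers/punkt',
        'stopwords': 'corpora/stopwords',
    }
    missing = []
    for name, path in checks.items():
        try:
            nltk.data.find(path)
        except LookupError:
            missing.append(name)
    return missing

print(verify_nltk_resources())  # [] when both resources are available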
@@ -50,6 +67,10 @@ MODELS = {
 }
 }
 
+def simple_tokenize(text):
+    """Simple tokenization fallback method."""
+    return text.split()
+
 def preprocess_text(text, lang='german'):
     # Convert to lowercase
     text = text.lower()
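simple_tokenize() above is a plain whitespace split. That is an acceptable fallback in this file because preprocess_text() strips punctuation with re.sub() before tokenizing, so what is mainly lost versus word_tokenize() is language-aware handling of clitics and abbreviations. A quick illustration with hypothetical inputs:

print(simple_tokenize("das ist ein test"))
# ['das', 'ist', 'ein', 'test'] -- same result word_tokenize() gives for cleaned text

print(simple_tokenize("don't stop"))
# ["don't", 'stop'] -- word_tokenize() would split this into ['do', "n't", 'stop'],
# but apostrophes never reach the tokenizer here: re.sub() removes them first.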
@@ -58,15 +79,26 @@ def preprocess_text(text, lang='german'):
     text = re.sub(r'[^a-zA-Z\s]', '', text)
 
     # Tokenize
-    tokens = word_tokenize(text, language=lang)
+    try:
+        tokens = word_tokenize(text, language=lang)
+    except LookupError:
+        print(f"Warning: NLTK punkt tokenizer for {lang} not found. Using simple tokenization.")
+        tokens = simple_tokenize(text)
 
     # Remove stopwords
-    stop_words = set(stopwords.words(lang))
+    try:
+        stop_words = set(stopwords.words(lang))
+    except LookupError:
+        print(f"Warning: Stopwords for {lang} not found. Skipping stopword removal.")
+        stop_words = set()
     tokens = [token for token in tokens if token not in stop_words]
 
     # Stemming
-    stemmer = SnowballStemmer(lang)
-    tokens = [stemmer.stem(token) for token in tokens]
+    try:
+        stemmer = SnowballStemmer(lang)
+        tokens = [stemmer.stem(token) for token in tokens]
+    except ValueError:
+        print(f"Warning: SnowballStemmer for {lang} not available. Skipping stemming.")
 
     return ' '.join(tokens)
 
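End-to-end, the changed preprocess_text() now degrades step by step instead of raising LookupError when an NLTK resource is missing. A quick sanity check, assuming app.py is importable as a module named app (hypothetical session):

from app import preprocess_text

print(preprocess_text("Das ist ein einfacher Test", lang='german'))
# With punkt, stopwords, and snowball data available: stopwords such as
# 'das', 'ist', 'ein' are dropped and the remaining tokens are stemmed,
# e.g. 'einfach test'.
# With resources missing: warnings are printed and processing continues,
# e.g. falling back to 'das ist ein einfacher test' (lowercased, whitespace-split).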