Chris4K committed
Commit 9ace3c0 · verified · 1 Parent(s): 2824ed7

Update app.py

Files changed (1)
  1. app.py +36 -4
app.py CHANGED
@@ -29,6 +29,23 @@ from tokenizers import Tokenizer
 from tokenizers.models import BPE
 from tokenizers.trainers import BpeTrainer
 
+
+
+def download_nltk_resources():
+    resources = [
+        'punkt',
+        'stopwords',
+        'snowball_data',
+    ]
+    for resource in resources:
+        try:
+            nltk.download(resource, quiet=True)
+        except Exception as e:
+            print(f"Failed to download {resource}: {str(e)}")
+
+download_nltk_resources()
+
+
 nltk.download('stopwords', quiet=True)
 nltk.download('punkt', quiet=True)
 
@@ -50,6 +67,10 @@ MODELS = {
     }
 }
 
+def simple_tokenize(text):
+    """Simple tokenization fallback method."""
+    return text.split()
+
 def preprocess_text(text, lang='german'):
     # Convert to lowercase
     text = text.lower()
@@ -58,15 +79,26 @@ def preprocess_text(text, lang='german'):
     text = re.sub(r'[^a-zA-Z\s]', '', text)
 
     # Tokenize
-    tokens = word_tokenize(text, language=lang)
+    try:
+        tokens = word_tokenize(text, language=lang)
+    except LookupError:
+        print(f"Warning: NLTK punkt tokenizer for {lang} not found. Using simple tokenization.")
+        tokens = simple_tokenize(text)
 
     # Remove stopwords
-    stop_words = set(stopwords.words(lang))
+    try:
+        stop_words = set(stopwords.words(lang))
+    except LookupError:
+        print(f"Warning: Stopwords for {lang} not found. Skipping stopword removal.")
+        stop_words = set()
     tokens = [token for token in tokens if token not in stop_words]
 
     # Stemming
-    stemmer = SnowballStemmer(lang)
-    tokens = [stemmer.stem(token) for token in tokens]
+    try:
+        stemmer = SnowballStemmer(lang)
+        tokens = [stemmer.stem(token) for token in tokens]
+    except ValueError:
+        print(f"Warning: SnowballStemmer for {lang} not available. Skipping stemming.")
 
     return ' '.join(tokens)
 
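
For a quick sanity check, here is a minimal sketch of how the hardened pipeline behaves after this commit. It is not part of the commit itself: it assumes app.py is importable on PYTHONPATH, that download_nltk_resources() fetched the German data at import time, and the sample sentence and expected output are illustrative only.

# Hypothetical smoke test for the new fallbacks; not part of this commit.
# Assumes app.py is on PYTHONPATH and its imports succeed.
from app import preprocess_text, simple_tokenize

# The fallback tokenizer is a plain whitespace split and needs no NLTK data.
assert simple_tokenize("hallo schöne welt") == ["hallo", "schöne", "welt"]

# With the German punkt/stopwords/snowball data present, the full pipeline
# runs: lowercase, strip non-letters, tokenize, drop stopwords, stem.
print(preprocess_text("Die Hunde laufen schnell durch den Park"))
# roughly: "hund lauf schnell park"

# If any of that data were missing, the same call would print a warning for
# each failed step and degrade gracefully (whitespace tokens, no stopword
# removal, unstemmed tokens) instead of raising LookupError.

Catching LookupError and ValueError narrowly, rather than a bare except, keeps genuine bugs visible while tolerating missing corpora and unsupported languages.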