alexkueck committed on
Commit
28815ea
1 Parent(s): 5301d93

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +5 -4
utils.py CHANGED
@@ -63,12 +63,15 @@ import nltk
63
  from nltk.corpus import stopwords
64
  from nltk.tokenize import word_tokenize
65
  from nltk.stem import WordNetLemmatizer
66
- nltk.download('punkt')
67
 
68
  from sklearn.feature_extraction.text import TfidfVectorizer
69
  from sklearn.metrics.pairwise import cosine_similarity
70
  import numpy as np
71
 
 
 
 
 
72
 
73
 
74
  ################################################
@@ -109,9 +112,7 @@ def normalise_prompt (prompt):
109
  tokens = [word for word in tokens if word.isalnum()]
110
 
111
  # Stop Word Entfernung
112
- nltk.download('stopwords')
113
- stop_words = set(stopwords.words('deutsch'))
114
- tokens = [word for word in tokens if not word in stop_words]
115
  # 5. Lemmatisierung: Worte in Grundform bringen, um Text besser vergleichen zu können
116
  nltk.download('wordnet')
117
  lemmatizer = WordNetLemmatizer()
 
63
  from nltk.corpus import stopwords
64
  from nltk.tokenize import word_tokenize
65
  from nltk.stem import WordNetLemmatizer
 
66
 
67
  from sklearn.feature_extraction.text import TfidfVectorizer
68
  from sklearn.metrics.pairwise import cosine_similarity
69
  import numpy as np
70
 
71
+ #für die Normalisierung
72
+ nltk.download('punkt')
73
+ nltk.download('stopwords')
74
+ german_stopwords = set(stopwords.words('german'))
75
 
76
 
77
  ################################################
 
112
  tokens = [word for word in tokens if word.isalnum()]
113
 
114
  # Stop Word Entfernung
115
+ tokens = [word for word in tokens if not word in german_stopwords]
 
 
116
  # 5. Lemmatisierung: Worte in Grundform bringen, um Text besser vergleichen zu können
117
  nltk.download('wordnet')
118
  lemmatizer = WordNetLemmatizer()