BilalSardar committed
Commit
7567b06
1 Parent(s): 788794e

Update app.py

Files changed (1): app.py (+17 -4)
app.py CHANGED
@@ -7,6 +7,7 @@ import nltk
 from nltk.stem import WordNetLemmatizer
 from nltk.tokenize import word_tokenize
 from nltk.corpus import wordnet
+from nltk.tag import pos_tag
 
 nltk.download('punkt')
 nltk.download('wordnet')
@@ -22,22 +23,34 @@ def get_wordnet_pos(tag):
     elif tag.startswith('R'):
         return wordnet.ADV
     else:
-        return wordnet.NOUN  # Default to noun if the POS tag is not found
+        return wordnet.NOUN
+
 
 def get_lemma(word):
+    nltk.download('averaged_perceptron_tagger')
+    nltk.download('wordnet')
     lemmatizer = WordNetLemmatizer()
 
+    exceptions = {
+        'are': 'are',    # Preserve 'are' as-is
+        'have': 'have',  # Preserve 'have' as-is
+        'do': 'do',      # Preserve 'do' as-is
+        'am': 'am'
+    }
+
     tokens = word_tokenize(word)
-    tagged_words = nltk.pos_tag(tokens)
+    tagged_words = pos_tag(tokens)
     lemmas = []
     for tagged_word in tagged_words:
         word = tagged_word[0]
         pos = tagged_word[1]
         wordnet_pos = get_wordnet_pos(pos)
-        lemma = lemmatizer.lemmatize(word, pos=wordnet_pos)
+        if word in exceptions:
+            lemma = exceptions[word]
+        else:
+            lemma = lemmatizer.lemmatize(word, pos=wordnet_pos)
         lemmas.append(lemma)
     return ' '.join(lemmas)
-
 def apply_lemma_to_string(sentence):
     words = word_tokenize(sentence)
     lemmas = [get_lemma(word) for word in words]
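
For context on why the exceptions dict was introduced: WordNet's lemmatizer maps auxiliary verbs such as 'are' and 'am' to the base form 'be' when tagged as verbs, and the exception table suppresses that. Below is a minimal editorial sketch of the behavior this commit builds on; it is not part of the commit, and the sample inputs ('running', 'are', 'am') are illustrative choices, not taken from the repository.

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

# Same resources the updated app.py downloads.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

lemmatizer = WordNetLemmatizer()

# pos_tag emits Penn Treebank tags, which get_wordnet_pos() in app.py
# maps onto wordnet.VERB / wordnet.NOUN / etc.
print(pos_tag(word_tokenize('running')))         # [('running', 'VBG')]

# Verb lemmatization behaves as expected for content words...
print(lemmatizer.lemmatize('running', pos='v'))  # run

# ...but collapses auxiliaries to 'be', which is what the commit's
# exceptions dict ('are', 'have', 'do', 'am') is there to prevent:
print(lemmatizer.lemmatize('are', pos='v'))      # be
print(lemmatizer.lemmatize('am', pos='v'))       # be

Keeping the exception lookup inside the per-token loop, before the lemmatize() call, means the listed words bypass WordNet entirely rather than being lemmatized and then restored.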