rockerritesh committed on
Commit
9e484af
·
verified ·
1 Parent(s): 3600617

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +59 -3
app.py CHANGED
@@ -12,11 +12,67 @@ import nltk
12
  # # English words from NLTK corpus
13
  # english_words = set(nltk.corpus.words.words())
14
 
15
- with open("index.dic") as f:
16
- hunspell_words = {line.split("/")[0].strip() for line in f if not line.startswith("#")}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
  def is_english_word(word):
19
- return word.lower() in hunspell_words
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
 
22
  # Define Devanagari digits and patterns for matching
 
12
  # # English words from NLTK corpus
13
  # english_words = set(nltk.corpus.words.words())
14
 
15
+ # with open("index.dic") as f:
16
+ # hunspell_words = {line.split("/")[0].strip() for line in f if not line.startswith("#")}
17
+
18
+ # def is_english_word(word):
19
+ # return word.lower() in hunspell_words
20
+
21
+ from nltk.stem import WordNetLemmatizer, PorterStemmer
22
+ from nltk.corpus import words, wordnet
23
+ import spacy
24
+ from spellchecker import SpellChecker
25
+ import string
26
+
# Fetch the NLTK corpora that the lemmatizer and the word list read from.
for _corpus in ('wordnet', 'words'):
    nltk.download(_corpus)

# Shared NLP helpers, created once when the module is imported.
spell = SpellChecker()
nlp = spacy.load("en_core_web_sm")
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Lookup vocabulary: the NLTK word list merged with every word known to the
# spell checker's frequency table.
english_words = set(words.words())
combined_dictionary = english_words | set(spell.word_frequency.keys())
40
 
def is_english_word(word):
    """Return the recognised English form of *word*, or None.

    The input is lowercased and stripped of leading/trailing ASCII
    punctuation. The cleaned word is then looked up in
    ``combined_dictionary`` under progressively looser normalisations:

    1. the cleaned word itself,
    2. its WordNet lemma,
    3. its Porter stem,
    4. the spell checker's best correction.

    Args:
        word: The raw token to check (a ``str``).

    Returns:
        The first normalised form found in ``combined_dictionary``. This may
        differ from the input (a lemma, a stem, or a corrected spelling).
        ``None`` if the cleaned word is empty or no form is found.
    """
    word_cleaned = word.lower().strip(string.punctuation)
    if not word_cleaned:
        return None

    # Ordered from cheapest/strictest to most expensive/loosest. Each
    # normaliser is only invoked if all earlier ones failed, so the spell
    # checker runs only as a last resort.
    normalisers = (
        lambda token: token,   # 1. direct dictionary match
        lemmatizer.lemmatize,  # 2. lemmatization
        stemmer.stem,          # 3. stemming
        spell.correction,      # 4. spell correction
    )
    for normalise in normalisers:
        candidate = normalise(word_cleaned)
        # A normaliser may yield a value that is not in the dictionary
        # (possibly None from the spell checker); the membership test simply
        # fails in that case and the next normaliser is tried.
        if candidate in combined_dictionary:
            return candidate

    # NOTE: an earlier revision finished with a spaCy fallback that accepted
    # any token whose ``lang_`` was "en". ``Token.lang_`` is the language of
    # the loaded pipeline's vocabulary, not a per-token language detection,
    # so with an English model it is always "en" and every alphabetic token
    # was reported as English. That fallback is intentionally not performed.
    return None
75
+
76
 
77
 
78
  # Define Devanagari digits and patterns for matching