nevisende committed on
Commit
12c6dc9
·
1 Parent(s): 2511227

Feat: make response sentence-based

Browse files
Files changed (3) hide show
  1. .idea/.gitignore +8 -0
  2. .idea/word-analyzer.iml +12 -0
  3. app.py +52 -51
.idea/.gitignore ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # Default ignored files
2
+ /shelf/
3
+ /workspace.xml
4
+ # Editor-based HTTP Client requests
5
+ /httpRequests/
6
+ # Datasource local storage ignored files
7
+ /dataSources/
8
+ /dataSources.local.xml
.idea/word-analyzer.iml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <module type="PYTHON_MODULE" version="4">
3
+ <component name="NewModuleRootManager">
4
+ <content url="file://$MODULE_DIR$" />
5
+ <orderEntry type="jdk" jdkName="hf2" jdkType="Python SDK" />
6
+ <orderEntry type="sourceFolder" forTests="false" />
7
+ </component>
8
+ <component name="PyDocumentationSettings">
9
+ <option name="format" value="PLAIN" />
10
+ <option name="myDocStringFormat" value="Plain" />
11
+ </component>
12
+ </module>
app.py CHANGED
@@ -13,6 +13,7 @@ from nltk.tokenize import word_tokenize
13
  from nltk.tag import pos_tag
14
  import gradio as gr
15
 
 
16
 
17
  load_dotenv()
18
  # Configuration
@@ -42,6 +43,7 @@ except IOError:
42
  spacy.cli.download(CONFIG['SPACY_MODEL'])
43
  nlp = spacy.load(CONFIG['SPACY_MODEL'])
44
 
 
45
  def get_wordnet_pos(treebank_tag):
46
  """Map POS tag to first character used by WordNet."""
47
  tag_map = {
@@ -49,6 +51,7 @@ def get_wordnet_pos(treebank_tag):
49
  }
50
  return tag_map.get(treebank_tag[0], None)
51
 
 
52
  def lesk_algorithm(word, sentence, pos=None):
53
  """Implement the Lesk algorithm for word sense disambiguation."""
54
  word = word.lower()
@@ -69,79 +72,76 @@ def lesk_algorithm(word, sentence, pos=None):
69
 
70
  return best_sense
71
 
72
- def create_unique_index(word, meaning, sentence):
 
73
  """Create a unique index for each word-meaning pair."""
74
- combined = f"{word}_{meaning}_{sentence}".encode('utf-8')
75
  return hashlib.md5(combined).hexdigest()
76
 
 
77
  def is_meaningful_word(token):
78
  """Check if a word is meaningful and should be included in the analysis."""
79
- return (token.has_vector and # This ensures the word is in spaCy's vocabulary
80
- not token.is_stop and # Exclude stop words
81
  token.pos_ not in ['PUNCT', 'SYM', 'X'] and # Exclude punctuation, symbols, and other
82
  len(token.text) > 1) # Exclude single-character tokens
83
 
 
 
 
84
  def process_sentence(sent):
85
- """Process a single sentence and return word information."""
86
- word_info = defaultdict(lambda: {"lemma": "", "meanings": []})
87
  doc = nlp(sent)
88
-
89
  for token in doc:
90
- if is_meaningful_word(token):
 
 
 
 
 
 
 
 
 
 
91
  word = token.text.lower()
92
  wordnet_pos = get_wordnet_pos(token.tag_)
93
-
94
- if not word_info[word]["lemma"]:
95
- word_info[word]["lemma"] = token.lemma_
96
-
97
  best_sense = lesk_algorithm(word, sent, wordnet_pos)
98
-
99
  if best_sense:
100
  definition = best_sense.definition()
101
  pos = best_sense.pos()
102
-
103
- unique_index = create_unique_index(word, definition, sent)
104
-
105
- new_meaning = {
 
 
106
  "index": unique_index,
107
  "meaning": definition,
108
- "POS": pos,
109
- "sentence": sent
110
- }
111
-
112
- if not any(m['meaning'] == definition for m in word_info[word]["meanings"]):
113
- word_info[word]["meanings"].append(new_meaning)
114
-
115
- return dict(word_info)
 
 
116
 
117
  def get_word_info(text):
118
- """Get word information for all sentences in the text."""
119
  sentences = nltk.sent_tokenize(text)
120
- word_info = defaultdict(lambda: {"lemma": "", "meanings": []})
121
-
122
- with ProcessPoolExecutor() as executor:
123
- future_to_sentence = {executor.submit(process_sentence, sent): sent for sent in sentences}
124
- for future in as_completed(future_to_sentence):
125
- sentence_info = future.result()
126
- for word, info in sentence_info.items():
127
- word_info[word]["lemma"] = info["lemma"]
128
- word_info[word]["meanings"].extend(info["meanings"])
129
-
130
- # If a word has no meanings, try to get a default definition
131
- for word, info in word_info.items():
132
- if not info["meanings"]:
133
- synsets = wn.synsets(word)
134
- if synsets:
135
- definition = synsets[0].definition()
136
- pos = synsets[0].pos()
137
- info["meanings"].append({
138
- "index": create_unique_index(word, definition, ""),
139
- "meaning": definition,
140
- "POS": pos,
141
- "sentence": "Default definition"
142
- })
143
-
144
- return dict(word_info)
145
 
146
  def process_text(selected_text, user_text):
147
  """Process the input text and return JSON results."""
@@ -153,6 +153,7 @@ def process_text(selected_text, user_text):
153
  logger.error(f"Error processing text: {str(e)}")
154
  return json.dumps({"error": "An error occurred while processing the text."})
155
 
 
156
  # Sample texts
157
  examples = [
158
  "The chef will season the steak with salt and pepper before grilling. Pumpkin spice lattes usually season the arrival of autumn.",
 
13
  from nltk.tag import pos_tag
14
  import gradio as gr
15
 
16
+ nltk.download('punkt_tab')
17
 
18
  load_dotenv()
19
  # Configuration
 
43
  spacy.cli.download(CONFIG['SPACY_MODEL'])
44
  nlp = spacy.load(CONFIG['SPACY_MODEL'])
45
 
46
+
47
  def get_wordnet_pos(treebank_tag):
48
  """Map POS tag to first character used by WordNet."""
49
  tag_map = {
 
51
  }
52
  return tag_map.get(treebank_tag[0], None)
53
 
54
+
55
  def lesk_algorithm(word, sentence, pos=None):
56
  """Implement the Lesk algorithm for word sense disambiguation."""
57
  word = word.lower()
 
72
 
73
  return best_sense
74
 
75
+
76
def create_unique_index(word, meaning):
    """Create a unique index for each word-meaning pair.

    The index is the MD5 hex digest of the string "word_meaning", so the
    same (word, meaning) pair always yields the same identifier.
    """
    key = "_".join((word, meaning))
    return hashlib.md5(key.encode('utf-8')).hexdigest()
80
 
81
+
82
def is_meaningful_word(token):
    """Check if a word is meaningful and should be included in the analysis.

    A token qualifies only when it is not a stop word, is not tagged as
    punctuation / symbol / other (PUNCT, SYM, X), and is longer than one
    character.
    """
    if token.is_stop:
        return False
    if token.pos_ in ('PUNCT', 'SYM', 'X'):
        return False
    return len(token.text) > 1
87
 
88
+
89
+
90
+
91
def process_sentence(sent):
    """Process a single sentence and return word information in order.

    Every token in the sentence produces one dict in the returned list:
    punctuation and whitespace tokens are tagged with a "type" marker,
    tokens whose sense the Lesk step resolves carry lemma/index/meaning/POS,
    and anything that cannot be disambiguated is tagged "unknown".
    """
    entries = []

    for token in nlp(sent):
        # Structural tokens are recorded as-is so sentence order survives.
        if token.is_punct:
            entries.append({"original": token.text, "type": "punctuation"})
            continue
        if token.is_space:
            entries.append({"original": token.text, "type": "space"})
            continue

        lowered = token.text.lower()
        sense = lesk_algorithm(lowered, sent, get_wordnet_pos(token.tag_))

        if sense:
            meaning = sense.definition()
            entries.append({
                "original": token.text,
                "lemma": token.lemma_,
                "index": create_unique_index(lowered, meaning),
                "meaning": meaning,
                "POS": sense.pos(),
            })
        else:
            # No WordNet sense found for this token.
            entries.append({"original": token.text, "type": "unknown"})

    return entries
133
+
134
 
135
def get_word_info(text):
    """Get word information for all sentences in the text, preserving sentence structure.

    Returns a list with one entry per sentence, where each entry is the
    ordered token-info list produced by process_sentence().
    """
    return [process_sentence(sentence) for sentence in nltk.sent_tokenize(text)]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
 
146
  def process_text(selected_text, user_text):
147
  """Process the input text and return JSON results."""
 
153
  logger.error(f"Error processing text: {str(e)}")
154
  return json.dumps({"error": "An error occurred while processing the text."})
155
 
156
+
157
  # Sample texts
158
  examples = [
159
  "The chef will season the steak with salt and pepper before grilling. Pumpkin spice lattes usually season the arrival of autumn.",