shivakerur99 committed on
Commit
1387d41
1 Parent(s): a1a449a

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +12 -14
main.py CHANGED
@@ -6,24 +6,25 @@ from fastapi.middleware.cors import CORSMiddleware
6
  from sqlalchemy import create_engine, MetaData, Table, Column, Integer, String
7
  from databases import Database
8
  from textblob import TextBlob
9
- import os
10
 
11
  import whisperx
12
  import gc
13
- import re
14
- import openai
15
- import time
16
 
17
- import spacy
18
-
19
- # Load the English tokenizer, tagger, parser, NER, and word vectors
20
- nlp = spacy.load("en_core_web_sm")
21
 
 
 
 
 
 
 
 
22
 
23
  openai.api_key = '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'
24
 
25
  app = FastAPI()
26
 
 
 
27
  import requests
28
  import json
29
 
@@ -90,12 +91,9 @@ def parse_conversation(content):
90
 
91
 
92
  def extract_active_words(text):
93
- # Process the text with spaCy
94
- doc = nlp(text)
95
-
96
- # Extract tokens that are not stopwords
97
- active_words = [re.sub(r'[^\w\s]', '', token.text) for token in doc if not token.is_stop and not token.is_digit and not token.is_punct and token.text != "Speaker"]
98
-
99
  return active_words
100
 
101
 
 
6
  from sqlalchemy import create_engine, MetaData, Table, Column, Integer, String
7
  from databases import Database
8
  from textblob import TextBlob
 
9
 
10
  import whisperx
11
  import gc
 
 
 
12
 
 
 
 
 
13
 
14
+ import nltk
15
+ from nltk.tokenize import word_tokenize
16
+ from nltk.corpus import stopwords
17
+ import openai
18
+ import time
19
+ nltk.download('punkt')
20
+ nltk.download('stopwords')
21
 
22
  openai.api_key = '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'
23
 
24
  app = FastAPI()
25
 
26
+ import os
27
+
28
  import requests
29
  import json
30
 
 
91
 
92
 
93
  def extract_active_words(text):
94
+ tokens = word_tokenize(text)
95
+ stop_words = set(stopwords.words('english'))
96
+ active_words = [word for word in tokens if word.isalnum() and word.lower() not in stop_words]
 
 
 
97
  return active_words
98
 
99