ayush-thakur02 committed on
Commit
5227ac1
1 Parent(s): d6f86e1

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +64 -0
app.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from sklearn.feature_extraction.text import TfidfVectorizer
3
+ from sklearn.metrics.pairwise import cosine_similarity
4
+ from nltk import word_tokenize
5
+ from nltk.stem import WordNetLemmatizer
6
+ from nltk.corpus import stopwords
7
+ import nltk
8
+ import json
9
+
10
# Download the NLTK data that preprocess() relies on at runtime.
# "punkt_tab" is the tokenizer data word_tokenize loads on NLTK >= 3.9;
# "punkt" is kept so older NLTK releases keep working.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')
14
+
15
def preprocess(sentence):
    """Normalize a sentence into a space-joined string of content-word lemmas.

    The sentence is lower-cased and tokenized with ``word_tokenize``. Tokens
    that are not purely alphanumeric (e.g. punctuation) and English stop
    words are dropped; the remaining tokens are lemmatized with
    ``WordNetLemmatizer``.

    Args:
        sentence: Raw text to normalize.

    Returns:
        The surviving lemmas joined by single spaces, or an empty string
        when no token survives the filters.
    """
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    lemmas = []
    for token in word_tokenize(sentence.lower()):
        # Check the stop list against the surface form first: the
        # lemmatizer's default noun mode rewrites some stop words
        # (e.g. "was" -> "wa", "has" -> "ha"), after which they would no
        # longer match the stop list and would leak into the output.
        if not token.isalnum() or token in stop_words:
            continue
        lemma = lemmatizer.lemmatize(token)
        # A lemma can itself be a stop word, so keep the post-lemmatization
        # check the original code performed.
        if lemma not in stop_words:
            lemmas.append(lemma)
    return ' '.join(lemmas)
22
+
23
def find_most_similar(sentence, candidates, threshold=0.15):
    """Return the candidates whose similarity to *sentence* meets *threshold*.

    The query and every candidate are normalized with ``preprocess`` and
    embedded together with a freshly fitted ``TfidfVectorizer``; each
    candidate is then scored by cosine similarity against the query.

    Args:
        sentence: The query text.
        candidates: Sequence of candidate sentences to score.
        threshold: Minimum cosine similarity (inclusive) for a candidate to
            be returned.

    Returns:
        A list of ``{"sentence": ..., "similarity_score": ...}`` dicts, in
        the same order as *candidates*, with scores rounded to 4 decimals.
        The list is empty when there are no candidates or when no text
        yields any usable term.
    """
    # With nothing to compare against, the similarity matrix would have
    # zero rows and scikit-learn would raise instead of returning nothing.
    if not candidates:
        return []

    input_bits = preprocess(sentence)
    chunks = [preprocess(candidate) for candidate in candidates]

    vectorizer = TfidfVectorizer()
    try:
        vectors = vectorizer.fit_transform([input_bits] + chunks)
    except ValueError:
        # Raised by scikit-learn for an empty vocabulary, i.e. the query and
        # all candidates were reduced to nothing usable by preprocessing.
        return []

    similarity_scores = cosine_similarity(vectors[0:1], vectors[1:]).flatten()

    return [
        # float() turns the NumPy scalar into a plain Python float so the
        # result is made only of built-in types.
        {"sentence": candidate, "similarity_score": round(float(score), 4)}
        for candidate, score in zip(candidates, similarity_scores)
        if score >= threshold
    ]
38
+
39
def read_sentences_from_file(file_location):
    """Read a text file and split its contents into sentences on full stops.

    Newlines are replaced by spaces before splitting, so a sentence may span
    several lines. Only ``.`` is treated as a sentence boundary.

    Args:
        file_location: Path of the text file to read.

    Returns:
        A list of the non-empty, whitespace-stripped fragments between full
        stops, in file order (the full stops themselves are not included).

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default text encoding.
    with open(file_location, 'r', encoding='utf-8') as file:
        text = file.read().replace('\n', ' ')
    fragments = (fragment.strip() for fragment in text.split('.'))
    return [fragment for fragment in fragments if fragment]
44
+
45
def fetch_vectors(file, sentence):
    """Gradio callback: find sentences in the uploaded file similar to *sentence*.

    Args:
        file: The uploaded file as handed over by the ``gr.File`` input.
            Accepts either an object exposing the path as ``.name`` or a
            plain path string; ``None`` means nothing was uploaded.
        sentence: The query text entered by the user.

    Returns:
        A JSON string (indented by 4 spaces) holding the list of matches
        produced by ``find_most_similar`` with a threshold of 0.15. It is
        ``"[]"`` when no file was uploaded or the file holds no sentences.
    """
    # No upload yet: report "no matches" instead of failing on ``None.name``.
    if file is None:
        return json.dumps([], indent=4)

    # Use ``.name`` when the object has one; otherwise treat the value
    # itself as the path (a plain string has no ``name`` attribute).
    file_location = getattr(file, "name", file)

    chunks = read_sentences_from_file(file_location)
    # An empty file has nothing to compare against.
    if not chunks:
        return json.dumps([], indent=4)

    similar_sentences = find_most_similar(sentence, chunks, threshold=0.15)
    return json.dumps(similar_sentences, indent=4)
50
+
51
# Gradio UI wiring: two inputs (uploaded file, query text) feed
# fetch_vectors, whose JSON string result is shown in a text box.
file_uploader = gr.File(label="Upload a .txt file")
text_input = gr.Textbox(label="Enter a sentence")
output_text = gr.Textbox(label="Similar Sentences JSON")

iface = gr.Interface(
    fetch_vectors,
    [file_uploader, text_input],
    output_text,
    title="Simple RAG - For QA",
    description="Upload a text file and enter the question. The threshold is set to 0.15.",
)

# Start the web app as soon as the module runs.
iface.launch(debug=True)