Anupam251272's picture
Create app.py
5bf2fd4 verified
raw
history blame
13.7 kB
import torch
import pandas as pd
import networkx as nx
import pdfplumber
import gradio as gr
from transformers import pipeline, MBartTokenizer, MBartForConditionalGeneration
from sentence_transformers import SentenceTransformer
import re
from typing import List, Dict, Optional
class MultilingualAyurvedicRecommender:
def __init__(self):
self.device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {self.device}")
# Initialize multilingual question-answering model
self.qa_model = pipeline(
"question-answering",
model="deepset/xlm-roberta-large-squad2",
device=0 if self.device == "cuda" else -1
)
# Initialize multilingual sentence transformer
self.similarity_model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
self.similarity_model.to(self.device)
# Initialize translation models
print("Loading translation models...")
self.translation_tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
self.translation_model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
self.translation_model.to(self.device)
# Language codes for supported Indian languages
self.language_codes = {
"en_XX": "English",
"hi_IN": "Hindi",
"te_IN": "Telugu",
"ta_IN": "Tamil",
"mr_IN": "Marathi",
"gu_IN": "Gujarati",
"bn_IN": "Bengali"
}
self.G = nx.Graph()
def detect_language(self, text: str) -> str:
"""
Detect the language of input text
Args:
text: Input text to detect language for
Returns:
str: Detected language code
"""
try:
inputs = self.translation_tokenizer(text, return_tensors="pt", padding=True).to(self.device)
lang_scores = self.translation_model(**inputs).logits[0]
detected_lang = self.translation_tokenizer.decode(torch.argmax(lang_scores))
return self.language_codes.get(detected_lang, "en_XX")
except Exception as e:
print(f"Language detection error: {e}")
return "en_XX"
def translate_text(self, text: str, target_lang: str) -> str:
"""
Translate text to target language
Args:
text: Text to translate
target_lang: Target language code
Returns:
str: Translated text
"""
try:
source_lang = self.detect_language(text)
if source_lang == target_lang:
return text
inputs = self.translation_tokenizer(text, return_tensors="pt", padding=True).to(self.device)
translated = self.translation_model.generate(
**inputs,
forced_bos_token_id=self.translation_tokenizer.lang_code_to_id[target_lang],
max_length=1024,
num_beams=4,
length_penalty=1.0
)
return self.translation_tokenizer.decode(translated[0], skip_special_tokens=True)
except Exception as e:
print(f"Translation error: {e}")
return text
def extract_from_pdf(self, pdf_path: str) -> pd.DataFrame:
"""
Extract text from PDF and parse into structured format
Args:
pdf_path: Path to PDF file
Returns:
pd.DataFrame: Extracted medicine data
"""
medicines_data = {
"Medicine": [],
"Conditions": [],
"Remedies": []
}
try:
with pdfplumber.open(pdf_path) as pdf:
current_medicine = None
current_conditions = []
current_remedies = []
for page in pdf.pages:
text = page.extract_text()
# Skip non-content pages
if any(header in text.upper() for header in ["INSTRUCTIONS", "INDEX", "FOREWORD"]):
continue
lines = text.split('\n')
for line in lines:
line = line.strip()
if not line:
continue
# Detect medicine headers
if re.match(r'^[A-Za-z\s]+\([A-Za-z\s]+\)', line):
if current_medicine and current_conditions:
medicines_data["Medicine"].append(current_medicine)
medicines_data["Conditions"].append(';'.join(set(current_conditions)))
medicines_data["Remedies"].append(';'.join(current_remedies))
current_medicine = line.split('(')[0].strip()
current_conditions = []
current_remedies = []
continue
if current_medicine:
# Detect remedy instructions
if re.search(r'\d+(?:\s*(?:gm|ml|times|drops|days))', line.lower()):
current_remedies.append(line)
# Detect conditions
elif any(condition in line.lower() for condition in [
'pain', 'ache', 'fever', 'cold', 'cough', 'diabetes',
'wounds', 'ulcer', 'skin', 'digestion', 'appetite'
]):
condition = line.split(':')[0] if ':' in line else line
current_conditions.append(condition)
# Add final medicine entry
if current_medicine and current_conditions:
medicines_data["Medicine"].append(current_medicine)
medicines_data["Conditions"].append(';'.join(set(current_conditions)))
medicines_data["Remedies"].append(';'.join(current_remedies))
df = pd.DataFrame(medicines_data)
return df[df['Conditions'].str.len() > 0].drop_duplicates()
except Exception as e:
print(f"Error processing PDF: {e}")
return pd.DataFrame()
def build_knowledge_graph(self, df: pd.DataFrame) -> None:
"""
Build knowledge graph from medicine data
Args:
df: DataFrame containing medicine data
"""
self.G.clear()
for _, row in df.iterrows():
medicine = row['Medicine']
conditions = row['Conditions'].split(';')
remedies = row['Remedies'].split(';')
self.G.add_node(medicine, type='medicine')
for condition in conditions:
condition = condition.strip()
if condition:
self.G.add_node(condition, type='condition')
self.G.add_edge(medicine, condition)
for remedy in remedies:
remedy = remedy.strip()
if remedy:
self.G.add_node(remedy, type='remedy', info=remedy)
self.G.add_edge(medicine, remedy)
def find_similar_conditions(self, symptoms: str, conditions: List[str]) -> List[tuple]:
"""
Find conditions similar to input symptoms
Args:
symptoms: Input symptoms text
conditions: List of known conditions
Returns:
List[tuple]: List of (condition, similarity_score) pairs
"""
symptoms_embedding = self.similarity_model.encode(symptoms, convert_to_tensor=True)
conditions_embeddings = self.similarity_model.encode(conditions, convert_to_tensor=True)
similarities = torch.nn.functional.cosine_similarity(
symptoms_embedding.unsqueeze(0),
conditions_embeddings,
dim=1
)
similar_conditions = [
(condition, float(similarity))
for condition, similarity in zip(conditions, similarities)
if similarity > 0.5
]
return sorted(similar_conditions, key=lambda x: x[1], reverse=True)
def recommend_medicines(self, symptoms: str, df: pd.DataFrame, target_lang: str = "en_XX") -> List[Dict]:
"""
Recommend medicines based on symptoms with language support
Args:
symptoms: Input symptoms text
df: DataFrame containing medicine data
target_lang: Target language code
Returns:
List[Dict]: List of recommendations
"""
english_symptoms = self.translate_text(symptoms, "en_XX")
all_conditions = [
c.strip() for conditions_list in df['Conditions'].str.split(';')
for c in conditions_list if c.strip()
]
all_conditions = list(set(all_conditions))
if not all_conditions:
return []
similar_conditions = self.find_similar_conditions(english_symptoms, all_conditions)
recommendations = []
for condition, confidence in similar_conditions:
medicines = [
n for n, attr in self.G.nodes(data=True)
if attr.get('type') == 'medicine' and self.G.has_edge(n, condition)
]
for medicine in medicines:
remedies = [
self.G.nodes[n]['info']
for n in self.G.neighbors(medicine)
if self.G.nodes[n]['type'] == 'remedy'
]
recommendations.append({
'medicine': self.translate_text(medicine, target_lang),
'condition': self.translate_text(condition, target_lang),
'confidence': confidence,
'remedies': [self.translate_text(remedy, target_lang) for remedy in remedies]
})
return sorted(recommendations, key=lambda x: x['confidence'], reverse=True)
def process_file_and_recommend(
self,
file: gr.File,
symptoms: str,
target_language: str = "English"
) -> str:
"""
Process input file and return recommendations in specified language
Args:
file: Uploaded PDF file
symptoms: Input symptoms text
target_language: Target language name
Returns:
str: Formatted recommendations text
"""
try:
target_lang = next(
(code for code, lang in self.language_codes.items()
if lang.lower() == target_language.lower()),
"en_XX"
)
df = self.extract_from_pdf(file.name)
if df.empty:
return self.translate_text("Error: Could not extract data from the PDF file.", target_lang)
self.build_knowledge_graph(df)
recommendations = self.recommend_medicines(symptoms, df, target_lang)
if not recommendations:
return self.translate_text("No matching recommendations found.", target_lang)
output = [self.translate_text("Ayurvedic Medicine Recommendations:", target_lang)]
for i, rec in enumerate(recommendations[:5], 1):
output.extend([
f"\n{i}. {self.translate_text('Medicine', target_lang)}: {rec['medicine']}",
f" {self.translate_text('Matching Condition', target_lang)}: {rec['condition']}",
f" {self.translate_text('Confidence Score', target_lang)}: {rec['confidence']:.2f}",
f" {self.translate_text('Recommended Remedies', target_lang)}:"
])
output.extend([f" - {remedy}" for remedy in rec['remedies']])
output.append("")
return "\n".join(output)
except Exception as e:
return f"Error: {str(e)}"
# Create and launch Gradio interface
def main():
recommender = MultilingualAyurvedicRecommender()
interface = gr.Interface(
fn=recommender.process_file_and_recommend,
inputs=[
gr.File(label="Upload Ayurvedic Home Remedies PDF"),
gr.Textbox(
label="Enter symptoms in any language (e.g., 'cold and fever' या 'सर्दी और बुखार' या 'జలుబు మరియు జ్వరం')"
),
gr.Dropdown(
choices=list(recommender.language_codes.values()),
label="Select output language",
value="English"
)
],
outputs=gr.Textbox(label="Recommendations"),
title="Multilingual Ayurvedic Medicine Recommender",
description="Get Ayurvedic medicine recommendations in your preferred language. Enter symptoms in any language!"
)
interface.launch(share=True)
if __name__ == "__main__":
main()