|
import gradio as gr |
|
import json |
|
import torch |
|
from transformers import GPT2LMHeadModel, GPT2Tokenizer |
|
from sentence_transformers import SentenceTransformer, util |
|
|
|
|
|
# Load the question/answer corpus: one JSON object per line, each carrying a
# "text" field that holds the full question/answer string.
with open("data/gpt2_ready_filtered.jsonl", "r", encoding="utf-8") as f:
    # Skip blank lines (e.g. a stray empty line at the end of the file);
    # json.loads would raise JSONDecodeError on them and abort start-up.
    data = [json.loads(line) for line in f if line.strip()]

# Only the raw text of each record is needed by the QA system.
texts = [item["text"] for item in data]
|
|
|
|
|
class SomaliQA:
    """Retrieval-based Somali question answering over a fixed list of texts.

    Each dataset text is expected to hold a question introduced by "Su'aal:"
    followed, on a new line, by an answer introduced by "Jawaab:".  A user
    question is answered by an exact match on the normalized question first,
    and otherwise by the dataset entry whose embedding is closest to it.
    """

    # Returned whenever no usable answer can be produced.
    NO_ANSWER_MESSAGE = "Ma helin jawaab ku habboon su’aashaada."

    def __init__(self, dataset_texts):
        """Load the models and pre-compute embeddings for ``dataset_texts``.

        Args:
            dataset_texts: Sequence of question/answer strings to search.
        """
        self.texts = dataset_texts
        self.embedder = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
        # Embed the whole corpus once so each query needs a single encode call.
        self.embeddings = self.embedder.encode(self.texts, convert_to_tensor=True)
        # NOTE(review): the GPT-2 tokenizer and model are loaded here but are
        # not used by any method of this class; they are kept because other
        # code may read these attributes.
        self.tokenizer = GPT2Tokenizer.from_pretrained("zakihassan04/gpt2-finetuned-somali")
        self.model = GPT2LMHeadModel.from_pretrained("zakihassan04/gpt2-finetuned-somali")
        self.tokenizer.pad_token = self.tokenizer.eos_token

    def extract_qa(self, text):
        """Split a dataset text into its question and answer.

        Returns:
            ``(question, answer)`` with the "Su'aal:" label removed and both
            parts stripped, or ``(None, None)`` when the text does not contain
            exactly one "Jawaab:" marker at the start of a line.
        """
        parts = text.split("\nJawaab:")
        if len(parts) != 2:
            return None, None
        question_part, answer_part = parts
        return question_part.replace("Su'aal:", "").strip(), answer_part.strip()

    def clean_text(self, text):
        """Normalize a question for exact-match comparison.

        Lower-cases the text, maps the typographic apostrophe to a plain one,
        collapses every run of whitespace to a single space, and drops
        trailing question marks together with any whitespace around them.
        """
        normalized = text.replace("’", "'").lower()
        # split()/join trims both ends and collapses internal whitespace runs.
        normalized = " ".join(normalized.split())
        return normalized.rstrip("? ")

    def answer(self, user_question):
        """Return the dataset answer that best fits ``user_question``.

        Args:
            user_question: Free-text question typed by the user.

        Returns:
            The stored answer of an exactly matching question if there is
            one, otherwise the answer of the semantically nearest dataset
            entry, otherwise ``NO_ANSWER_MESSAGE``.
        """
        # A blank question has nothing to match; embedding it would only
        # return an arbitrary nearest neighbour.
        if not user_question or not user_question.strip():
            return self.NO_ANSWER_MESSAGE

        cleaned_question = self.clean_text(user_question)

        # Pass 1: exact match on the normalized question text.
        for text in self.texts:
            question, stored_answer = self.extract_qa(text)
            if question and cleaned_question == self.clean_text(question):
                return stored_answer

        # Pass 2: nearest neighbour in embedding space.
        query_embedding = self.embedder.encode(cleaned_question, convert_to_tensor=True)
        hits = util.semantic_search(query_embedding, self.embeddings, top_k=1)
        if hits and len(hits[0]) > 0:
            best_index = hits[0][0]["corpus_id"]
            _, matched_answer = self.extract_qa(self.texts[best_index])
            # The best hit may be a text without an answer marker; fall
            # through to the not-found message rather than returning None.
            if matched_answer is not None:
                return matched_answer

        return self.NO_ANSWER_MESSAGE
|
|
|
|
|
# One shared QA engine for the whole app; constructing it loads the models
# and embeds every dataset text, so it is built once at start-up.
qa_system = SomaliQA(dataset_texts=texts)
|
|
|
|
|
def qa_interface(question):
    """Gradio callback: forward ``question`` to the shared QA system.

    Returns whatever ``qa_system.answer`` produces for the question.
    """
    reply = qa_system.answer(question)
    return reply
|
|
|
|
|
# Build the single-textbox web UI around qa_interface and start serving it.
gr.Interface(
    # Presentation
    title="Somali GPT-2 QA System (Dataset-based)",
    description="Weydii su’aal ku saabsan beeraha — waxaad helaysaa jawaab sax ah oo laga soo qaaday dataset-kaaga.",
    theme="compact",
    # Wiring: one text input is passed to the callback, one text output shown.
    fn=qa_interface,
    inputs="text",
    outputs="text",
).launch()
|
|