# app.py
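# Gradio demo: summarize input text with mT5-XLSum, translate Arabic summaries to
# English with MarianMT, then retrieve the closest-matching images from the local
# "images" directory via CLIP text-image similarity.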
import gradio as gr
from transformers import pipeline, AutoModel, AutoProcessor
import torch
import os
from PIL import Image

# Initialize models (outside process function)
summarizer = pipeline("summarization", "csebuetnlp/mT5_multilingual_XLSum")
translator_ar2en = pipeline("translation_ar_to_en", "Helsinki-NLP/opus-mt-ar-en")
clip_model = AutoModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Precompute CLIP embeddings for every image in the local image directory
def precompute_embeddings(image_dir="images"):
    image_paths = [os.path.join(image_dir, f) for f in os.listdir(image_dir) 
                  if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
    
    embeddings = []
    for path in image_paths:
        image = Image.open(path).convert("RGB")
        inputs = clip_processor(images=image, return_tensors="pt")
        with torch.no_grad():
            emb = clip_model.get_image_features(**inputs)
        # L2-normalize so the dot product with the text embedding is a cosine similarity
        embeddings.append(emb / emb.norm(dim=-1, keepdim=True))
    return image_paths, torch.cat(embeddings)

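# Computed once at startup so each request only needs to embed the query text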
image_paths, image_embeddings = precompute_embeddings()

def process(input_text, language):
    # Summarize the input text (truncate inputs longer than the model's limit)
    summary = summarizer(input_text, max_length=150, min_length=30, truncation=True)[0]['summary_text']
    
    # Translate Arabic summaries to English so the CLIP text encoder gets English input
    translated = ""
    if language == "Arabic":
        translated = translator_ar2en(summary)[0]['translation_text']
        query_text = translated
    else:
        query_text = summary
    
    # Text-image retrieval
    text_inputs = clip_processor(
        text=query_text, 
        return_tensors="pt", 
        padding=True, 
        truncation=True
    )
    with torch.no_grad():
        text_emb = clip_model.get_text_features(**text_inputs)
    # L2-normalize the text embedding so the dot product is a cosine similarity
    text_emb = text_emb / text_emb.norm(dim=-1, keepdim=True)
    
    # Rank images by cosine similarity and keep the top matches (guard against <3 images)
    similarities = (text_emb @ image_embeddings.T).squeeze(0)
    top_indices = similarities.topk(min(3, len(image_paths))).indices.tolist()
    results = [image_paths[i] for i in top_indices]
    
    return summary, translated, results

# Gradio interface
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🌍 Multi-Task AI: Summarization & Image Retrieval")
    
    with gr.Row():
        lang = gr.Dropdown(["English", "Arabic"], label="Input Language")
        text_input = gr.Textbox(label="Input Text", lines=5)
    
    with gr.Row():
        summary_out = gr.Textbox(label="Summary")
        trans_out = gr.Textbox(label="English Query Text", visible=False)
    
    gallery = gr.Gallery(label="Retrieved Images", columns=3)
    submit = gr.Button("Process", variant="primary")
    
    def toggle_translation(lang):
        return gr.update(visible=lang == "Arabic")
    
    lang.change(toggle_translation, lang, trans_out)
    submit.click(process, [text_input, lang], [summary_out, trans_out, gallery])

demo.launch()
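
# Minimal smoke test without the UI (hypothetical input text; uncomment the lines
# below and move them above `demo.launch()` to try the pipeline directly):
# summary, translated, images = process("Paste a long article here ...", "English")
# print(summary)
# print(images)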