import re
import json
import torch
import requests
import unicodedata
import soundfile as sf
import pymorphy2
from num2words import num2words

import gradio as gr
import wikipediaapi
from PIL import Image
from transformers import pipeline, CLIPProcessor, CLIPModel

import inspect

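# Compatibility shim: pymorphy2 still calls inspect.getargspec(), which was
# removed in Python 3.11, so an equivalent is rebuilt from inspect.signature().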
if not hasattr(inspect, 'getargspec'):
    def getargspec(func):
        sig = inspect.signature(func)
        defaults = []
        args = []
        varargs = None
        varkw = None
        for name, param in sig.parameters.items():
            if param.default is not param.empty:
                defaults.append(param.default)
            if param.kind == param.VAR_POSITIONAL:
                varargs = name
            elif param.kind == param.VAR_KEYWORD:
                varkw = name
            else:
                args.append(name)
        return args, varargs, varkw, tuple(defaults) if defaults else None
    inspect.getargspec = getargspec

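# Morphological analyzer used below to inflect Russian number words by case.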
morph = pymorphy2.MorphAnalyzer()

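# Fetch the list of landmark names (a JSON array of strings) that CLIP
# matches photos against.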
def load_attractions_json(url):
    r = requests.get(url)
    r.raise_for_status()
    return json.loads(r.text)

url = "https://raw.githubusercontent.com/nktssk/tourist-helper/refs/heads/main/landmarks.json"
landmark_titles = load_attractions_json(url)

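# Normalize Wikipedia text before summarization and TTS: drop IPA
# transcriptions and footnote markers, strip combining diacritics,
# and collapse whitespace.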
def clean_text(text):
    text = re.sub(r'МФА:?\s?\[.*?\]', '', text)
    text = re.sub(r'\[.*?\]', '', text)
    def rm_diacritics(c):
        return '' if unicodedata.category(c) == 'Mn' else c
    text = unicodedata.normalize('NFD', text)
    text = ''.join(rm_diacritics(c) for c in text)
    text = unicodedata.normalize('NFC', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s.,!?-]', '', text)
    return text.strip()

# Simplified case detection: map a preceding preposition to a pymorphy2 case grammeme.
def get_case_for_preposition(prep):
    # pymorphy2 grammemes: loct (prepositional), datv (dative),
    # ablt (instrumental), gent (genitive).
    d = {
        'в': 'loct', 'на': 'loct', 'о': 'loct', 'об': 'loct', 'обо': 'loct',
        'к': 'datv',
        'с': 'ablt', 'со': 'ablt', 'над': 'ablt', 'под': 'ablt',
        'из': 'gent', 'от': 'gent', 'у': 'gent', 'до': 'gent', 'для': 'gent'
    }
    return d.get(prep.lower(), 'nomn')

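# Silero TTS reads bare digits poorly, so numbers are spelled out as Russian
# words and inflected to the case implied by the preceding preposition,
# e.g. "с 5" -> "с пятью" rather than "с пять".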
def replace_numbers_with_text_in_context(text):
    tokens = text.split()
    result = []
    for i, token in enumerate(tokens):
        if re.match(r'^\d+(\.\d+)?$', token):
            # Pick the target case from the preceding preposition, if any.
            cse = 'nomn'
            if i > 0:
                cse = get_case_for_preposition(tokens[i - 1])
            # First render the number as words (nominative), then inflect
            # each word into the target case, keeping words pymorphy2
            # cannot inflect unchanged.
            n = float(token) if '.' in token else int(token)
            number_as_words = num2words(n, lang='ru').replace('-', ' ')
            inflected_subtokens = []
            for st in number_as_words.split():
                p = morph.parse(st)
                form = p[0].inflect({cse}) if p else None
                inflected_subtokens.append(form.word if form else st)
            result.append(' '.join(inflected_subtokens))
        else:
            result.append(token)
    return ' '.join(result)

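# Summarization (English) and EN->RU translation pipelines.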
summarizer = pipeline(
    "summarization",
    model="sshleifer/distilbart-cnn-12-6",
    tokenizer="sshleifer/distilbart-cnn-12-6"
)
translator = pipeline("translation_en_to_ru", model="Helsinki-NLP/opus-mt-en-ru")
wiki = wikipediaapi.Wikipedia("Nikita", "en")  # positional args: user_agent, language

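# CLIP gives zero-shot landmark recognition: a photo is matched against the
# landmark names by cosine similarity in the joint image/text embedding space.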
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

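# Pre-compute L2-normalized text embeddings once; recognizing an image then
# only needs a single forward pass plus a matrix product.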
text_inputs = clip_processor(text=landmark_titles, images=None, return_tensors="pt", padding=True)
with torch.no_grad():
    text_embeds = clip_model.get_text_features(**text_inputs)
    text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)

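# Silero TTS (Russian). torch.hub.load returns (model, example_text).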
language = 'ru'
model_id = 'v3_1_ru'
sample_rate = 48000
speaker = 'eugene'
silero_model, _ = torch.hub.load(
    repo_or_dir='snakers4/silero-models',
    model='silero_tts',
    language=language,
    speaker=model_id
)

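# Spell out digits, synthesize speech, and write a WAV file for Gradio playback.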
def text_to_speech(text, out_path="speech.wav"):
    text = replace_numbers_with_text_in_context(text)
    audio = silero_model.apply_tts(text=text, speaker=speaker, sample_rate=sample_rate)
    sf.write(out_path, audio, sample_rate)
    return out_path

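# Return a cleaned English summary, or None if the page does not exist.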
def fetch_wikipedia_summary(landmark):
    page = wiki.page(landmark)
    return clean_text(page.summary) if page.exists() else None

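# Return the best-matching landmark title and its cosine similarity score.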
def recognize_landmark_clip(image):
    if not isinstance(image, Image.Image):
        image = Image.fromarray(image)
    img_in = clip_processor(images=image, return_tensors="pt")
    with torch.no_grad():
        img_embed = clip_model.get_image_features(**img_in)
        img_embed = img_embed / img_embed.norm(p=2, dim=-1, keepdim=True)
    sim = (img_embed @ text_embeds.T).squeeze(0)
    best_idx = sim.argmax().item()
    return landmark_titles[best_idx], sim[best_idx].item()

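# Full pipeline: Wikipedia summary -> optional summarization -> EN->RU
# translation -> Russian speech.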
def process_landmark(landmark):
    txt = fetch_wikipedia_summary(landmark)
    if not txt:
        return None
    print('Wiki text:', txt)
    # Short texts are passed through; longer ones are condensed first.
    if len(txt) < 210:
        summary = txt
    else:
        summary = summarizer(txt, min_length=10, max_length=200)[0]["summary_text"]
    print('Summarized text:', summary)
    tr = translator(summary, max_length=1000)[0]["translation_text"]
    print('Translated text:', tr)
    return text_to_speech(tr)

def process_image_clip(image):
    recognized, score = recognize_landmark_clip(image)
    print(f'Recognized: {recognized} (similarity {score:.3f})')
    return process_landmark(recognized)

def process_text_clip(landmark):
    return process_landmark(landmark)

def reload_landmarks():
    # Also rebuild the CLIP text embeddings so recognition matches the new list.
    global landmark_titles, text_embeds
    landmark_titles = load_attractions_json(url)
    inputs = clip_processor(text=landmark_titles, images=None, return_tensors="pt", padding=True)
    with torch.no_grad():
        text_embeds = clip_model.get_text_features(**inputs)
        text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)

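# Gradio UI: a photo or a typed landmark name in, Russian audio out.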
with gr.Blocks() as demo:
    gr.Markdown("## Помощь туристу")
    with gr.Tabs():
        with gr.Tab("CLIP + Sum + Translate + T2S"):
            with gr.Row():
                image_input = gr.Image(label="Загрузите фото", type="pil")
                text_input = gr.Textbox(label="Или введите название")
            audio_output = gr.Audio(label="Результат")
            with gr.Row():
                btn_img = gr.Button("Распознать и перевести")
                btn_txt = gr.Button("Поиск по названию")
                btn_reload = gr.Button("Обновить список (Техническое)")
            btn_img.click(fn=process_image_clip, inputs=image_input, outputs=audio_output)
            btn_txt.click(fn=process_text_clip, inputs=text_input, outputs=audio_output)
            btn_reload.click(fn=reload_landmarks, inputs=None, outputs=None)

demo.launch(debug=True)