File size: 7,045 Bytes
80f5255 1f706a9 4daa7b9 1f706a9 2d45773 857662e 1f706a9 00bdbd5 fbf87c9 00bdbd5 1f706a9 857662e 297d1e5 1f706a9 9de6643 1f706a9 297d1e5 0cfb3b7 2d45773 1f706a9 2d45773 1f706a9 2d45773 1f706a9 ff275e2 1f706a9 00bdbd5 1f706a9 6c3f51d 2d45773 1f706a9 2d45773 297d1e5 1f706a9 297d1e5 1f706a9 2d45773 1f706a9 2d45773 1f706a9 2d45773 1f706a9 2d45773 1f706a9 2d45773 67bfb96 0da35d6 67bfb96 0da35d6 67bfb96 f485c44 1f706a9 2d45773 67bfb96 1f706a9 2d45773 1f706a9 2d45773 6c3f51d 2d45773 297d1e5 2d45773 0cfb3b7 1f706a9 0cfb3b7 1f706a9 32ea06e 1f706a9 6c3f51d 0cfb3b7 d63c5ce |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 |
import os
import re
import json
import torch
import requests
import unicodedata
import soundfile as sf
import pymorphy2
import gradio as gr
import wikipediaapi
from PIL import Image
from transformers import pipeline, CLIPProcessor, CLIPModel
import inspect
# Compatibility shim: install a minimal replacement for inspect.getargspec
# when the running interpreter does not provide one, so that code which
# still calls inspect.getargspec keeps working.
if not hasattr(inspect, 'getargspec'):
    def getargspec(func):
        """Return ``(args, varargs, varkw, defaults)`` for *func*.

        ``args`` lists every parameter that is neither ``*args`` nor
        ``**kwargs`` (keyword-only parameters are included), ``varargs`` and
        ``varkw`` are the names of the ``*``/``**`` parameters or ``None``,
        and ``defaults`` is a tuple of the declared default values, or
        ``None`` when no parameter has a default.
        """
        sig = inspect.signature(func)
        defaults = []
        args = []
        varargs = None
        varkw = None
        for name, param in sig.parameters.items():
            # Identity check against the sentinel: `!=` would call the
            # default value's own __ne__, which may be overloaded.
            if param.default is not param.empty:
                defaults.append(param.default)
            if param.kind == param.VAR_POSITIONAL:
                varargs = name
            elif param.kind == param.VAR_KEYWORD:
                varkw = name
            else:
                args.append(name)
        return args, varargs, varkw, tuple(defaults) if defaults else None

    inspect.getargspec = getargspec
# Shared pymorphy2 analyzer instance; replace_numbers_with_text_in_context
# uses it to parse and inflect the words of spelled-out numbers.
morph = pymorphy2.MorphAnalyzer()
def load_attractions_json(url, *, timeout=30):
    """Download *url* and return its body parsed as JSON.

    Args:
        url: Address of the JSON document.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block forever on a stalled
            connection.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within *timeout*.
        json.JSONDecodeError: If the body is not valid JSON.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return json.loads(response.text)
# Raw GitHub URL of the landmarks JSON file.
url = "https://raw.githubusercontent.com/nktssk/tourist-helper/refs/heads/main/landmarks.json"
# Downloaded once at import time. The result is passed to the CLIP processor
# as text prompts and indexed by recognize_landmark_clip.
# NOTE(review): presumably a JSON list of title strings — confirm against the file.
landmark_titles = load_attractions_json(url)
def clean_text(text):
    """Normalize a Wikipedia summary into plain text.

    Steps, in order:
      1. drop ``МФА: [...]`` transcription blocks and any other ``[...]`` span;
      2. strip combining marks (accents) from letters;
      3. remove every character that is not a word character, whitespace or
         one of ``. , ! ? -``;
      4. collapse whitespace runs into single spaces and trim the ends.

    Whitespace is collapsed *after* symbols are removed, so a removed symbol
    surrounded by spaces (e.g. a dash or an emptied pair of parentheses)
    does not leave a double space behind.
    """
    text = re.sub(r'МФА:?\s?\[.*?\]', '', text)
    text = re.sub(r'\[.*?\]', '', text)

    # Decompose, drop non-spacing marks (category Mn), then recompose.
    # NOTE(review): this also turns Cyrillic "й"/"ё" into "и"/"е", because
    # they decompose into a base letter plus a combining mark.
    decomposed = unicodedata.normalize('NFD', text)
    text = ''.join(c for c in decomposed if unicodedata.category(c) != 'Mn')
    text = unicodedata.normalize('NFC', text)

    text = re.sub(r'[^\w\s.,!?-]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
# Simplified mapping from a Russian preposition to the case it governs.
# Values are pymorphy2 case grammemes: 'loct' (prepositional), 'datv'
# (dative), 'ablt' (instrumental), 'gent' (genitive).
_PREPOSITION_CASES = {
    'в': 'loct', 'на': 'loct', 'о': 'loct', 'об': 'loct', 'обо': 'loct',
    'к': 'datv',
    'с': 'ablt', 'со': 'ablt', 'над': 'ablt', 'под': 'ablt',
    'из': 'gent', 'от': 'gent', 'у': 'gent', 'до': 'gent', 'для': 'gent',
}


def get_case_for_preposition(prep):
    """Return the pymorphy2 case grammeme governed by the preposition *prep*.

    The lookup is case-insensitive. Any word that is not a known preposition
    yields ``'nomn'`` (nominative). Every returned value is a grammeme name
    pymorphy2 accepts, so it can be passed straight to ``Parse.inflect``.
    """
    return _PREPOSITION_CASES.get(prep.lower(), 'nomn')
# Short case codes mapped to the grammeme names pymorphy2 understands, so the
# function below works whether get_case_for_preposition returns short codes
# ('loc', 'gen', ...) or full pymorphy2 grammemes ('loct', 'gent', ...).
_PYMORPHY_CASE_ALIASES = {
    'nom': 'nomn',
    'gen': 'gent',
    'dat': 'datv',
    'acc': 'accs',
    'ins': 'ablt',
    'loc': 'loct',
}

# An integer or decimal number, optionally followed by sentence punctuation.
_NUMBER_TOKEN_RE = re.compile(r'^(\d+(?:\.\d+)?)([.,!?]*)$')


def _number_to_words(number, case):
    """Spell *number* (a digit string) in Russian, inflected to *case*."""
    # num2words is not imported at file level; importing here keeps the
    # dependency local to the only code path that needs it.
    from num2words import num2words

    value = float(number) if '.' in number else int(number)
    words = num2words(value, lang='ru').replace('-', ' ').split()

    case = _PYMORPHY_CASE_ALIASES.get(case, case)
    if case == 'nomn':
        # num2words already produces the nominative form.
        return ' '.join(words)

    inflected = []
    for word in words:
        parses = morph.parse(word)
        best = parses[0] if parses else None
        # Leave the word alone when it cannot be analysed, has no case
        # category at all, or already stands in the requested case.
        if best is None or best.tag.case is None or best.tag.case == case:
            inflected.append(word)
            continue
        form = best.inflect({case})
        inflected.append(form.word if form else word)
    return ' '.join(inflected)


def replace_numbers_with_text_in_context(text):
    """Replace numeric tokens in *text* with Russian number words.

    The text is split on whitespace. A token that is an integer or decimal
    number (optionally followed by ``. , ! ?``) is spelled out with
    num2words. If the preceding token is a known preposition, each word of
    the spelled number is inflected independently to the case returned by
    get_case_for_preposition; otherwise the nominative form is used.
    Trailing punctuation is re-attached after the number words.

    Returns the tokens re-joined with single spaces.
    """
    tokens = text.split()
    result = []
    for i, token in enumerate(tokens):
        match = _NUMBER_TOKEN_RE.match(token)
        if match is None:
            result.append(token)
            continue
        number, trailing = match.groups()
        case = get_case_for_preposition(tokens[i - 1]) if i > 0 else 'nomn'
        result.append(_number_to_words(number, case) + trailing)
    return ' '.join(result)
# Summarization pipeline; model and tokenizer both come from the
# sshleifer/distilbart-cnn-12-6 checkpoint.
summarizer = pipeline(
    "summarization",
    model="sshleifer/distilbart-cnn-12-6",
    tokenizer="sshleifer/distilbart-cnn-12-6"
)
# English-to-Russian translation pipeline.
translator = pipeline("translation_en_to_ru", model="Helsinki-NLP/opus-mt-en-ru")
# NOTE(review): the positional arguments look like (user_agent, language) —
# confirm against the installed wikipediaapi version.
wiki = wikipediaapi.Wikipedia("Nikita", "en")
# CLIP model and its processor, used to match an image against landmark titles.
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
# Embed every landmark title once at import time and L2-normalize the result,
# so recognition only has to embed the image and take dot products.
text_inputs = clip_processor(text=landmark_titles, images=None, return_tensors="pt", padding=True)
with torch.no_grad():
    text_embeds = clip_model.get_text_features(**text_inputs)
    text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
# Silero text-to-speech configuration; sample_rate and speaker are read by
# text_to_speech.
language = 'ru'
model_id = 'v3_1_ru'
sample_rate = 48000
speaker = 'eugene'
# torch.hub.load returns a pair here; only the first element (the model) is
# kept. Note that model_id is what gets passed as the hub `speaker` argument.
silero_model, _ = torch.hub.load(
    repo_or_dir='snakers4/silero-models',
    model='silero_tts',
    language=language,
    speaker=model_id
)
def text_to_speech(text, out_path="speech.wav"):
    """Synthesize *text* with the Silero model and write it to *out_path*.

    Numeric tokens are spelled out first via
    replace_numbers_with_text_in_context. The audio is written with
    soundfile at the module-level ``sample_rate``.

    Returns:
        The path the audio was written to (*out_path*).
    """
    spoken_text = replace_numbers_with_text_in_context(text)
    waveform = silero_model.apply_tts(
        text=spoken_text,
        speaker=speaker,
        sample_rate=sample_rate,
    )
    sf.write(out_path, waveform, sample_rate)
    return out_path
def fetch_wikipedia_summary(landmark):
    """Return the cleaned Wikipedia summary for *landmark*.

    If the page does not exist, the sentinel string ``"Found error!"`` is
    returned instead.
    """
    page = wiki.page(landmark)
    if not page.exists():
        return "Found error!"
    return clean_text(page.summary)
def recognize_landmark_clip(image):
    """Find the landmark title whose CLIP text embedding best matches *image*.

    Args:
        image: A PIL image, or anything ``Image.fromarray`` accepts.

    Returns:
        A ``(title, score)`` pair: the best-matching entry of
        ``landmark_titles`` and its similarity to the image as a float.
    """
    pil_image = image if isinstance(image, Image.Image) else Image.fromarray(image)
    pixel_inputs = clip_processor(images=pil_image, return_tensors="pt")

    with torch.no_grad():
        features = clip_model.get_image_features(**pixel_inputs)
    # L2-normalize so the dot product with the (already normalized) text
    # embeddings is a cosine similarity.
    features = features / features.norm(p=2, dim=-1, keepdim=True)

    scores = (features @ text_embeds.T).squeeze(0)
    top = scores.argmax().item()
    return landmark_titles[top], scores[top].item()
def process_landmark(landmark):
    """Turn a landmark name into a Russian audio description.

    Pipeline: fetch the Wikipedia summary, summarize it (only when it is
    210 characters or longer), translate the result to Russian and
    synthesize speech. Intermediate texts are printed for debugging.

    Returns:
        The path of the generated audio file, or ``None`` when no Wikipedia
        page was found for *landmark*.
    """
    txt = fetch_wikipedia_summary(landmark)
    if txt == "Found error!":
        return None
    print('Wiki text: ')
    print(txt)
    if len(txt) < 210:
        summary = txt
    else:
        # truncation=True: Wikipedia summaries can be longer than the
        # summarizer's maximum input length; truncate instead of failing.
        summary = summarizer(
            txt, min_length=10, max_length=200, truncation=True
        )[0]["summary_text"]
    print('Summarized text: ')
    print(summary)
    tr = translator(summary, max_length=1000)[0]["translation_text"]
    print('Translated text: ')
    print(tr)
    return text_to_speech(tr)
def process_image_clip(image):
    """Recognize the landmark on *image* and return its audio description.

    Returns:
        The path of the generated audio file, or ``None`` when no image was
        supplied or no Wikipedia page exists for the recognized landmark.
    """
    # The button can be clicked before any image is uploaded.
    if image is None:
        return None
    recognized, _score = recognize_landmark_clip(image)
    print('Recognized: ')
    print(recognized)
    return process_landmark(recognized)
def process_text_clip(landmark):
    """Return the audio description for a landmark typed in by the user.

    Surrounding whitespace is ignored. Empty or blank input returns ``None``
    without looking anything up.
    """
    if landmark is None or not landmark.strip():
        return None
    return process_landmark(landmark.strip())
def reload_landmarks():
    """Re-download the landmark list and rebuild its CLIP text embeddings.

    Both ``landmark_titles`` and ``text_embeds`` are replaced. They must stay
    in step, because recognize_landmark_clip uses an index into the
    embeddings to look up the title.
    """
    global landmark_titles, text_embeds
    url = "https://raw.githubusercontent.com/nktssk/tourist-helper/refs/heads/main/landmarks.json"
    titles = load_attractions_json(url)

    inputs = clip_processor(text=titles, images=None, return_tensors="pt", padding=True)
    with torch.no_grad():
        embeds = clip_model.get_text_features(**inputs)
        embeds = embeds / embeds.norm(p=2, dim=-1, keepdim=True)

    # Publish both only after the new embeddings were computed successfully.
    landmark_titles, text_embeds = titles, embeds
# Gradio UI: one tab with an image input, a text input, an audio output and
# three buttons wired to the processing functions.
# NOTE(review): the original indentation was not available; the nesting of
# rows and components below is reconstructed — confirm the intended layout.
with gr.Blocks() as demo:
    gr.Markdown("## Помощь туристу")
    with gr.Tabs():
        with gr.Tab("CLIP + Sum + Translate + T2S"):
            with gr.Row():
                image_input = gr.Image(label="Загрузите фото", type="pil")
                text_input = gr.Textbox(label="Или введите название")
            audio_output = gr.Audio(label="Результат")
            with gr.Row():
                btn_img = gr.Button("Распознать и перевести")
                btn_txt = gr.Button("Поиск по названию")
                btn_reload = gr.Button("Обновить список (Техническое)")
            # Image -> recognized landmark -> audio.
            btn_img.click(fn=process_image_clip, inputs=image_input, outputs=audio_output)
            # Typed landmark name -> audio.
            btn_txt.click(fn=process_text_clip, inputs=text_input, outputs=audio_output)
            # Maintenance action: re-download the landmark list.
            btn_reload.click(fn=reload_landmarks, inputs=None, outputs=None)
demo.launch(debug=True)