sarahai committed on
Commit
b8553ac
·
verified ·
1 Parent(s): a1af8f5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +43 -19
app.py CHANGED
@@ -1,7 +1,10 @@
1
  import streamlit as st
2
  from transformers import AutoModelForSeq2SeqLM, T5ForConditionalGeneration, NllbTokenizer, T5Tokenizer
 
 
 
 
3
 
4
- # Initialize models and tokenizers
5
  translation_model_name = 'sarahai/nllb-uzbek-cyrillic-to-russian'
6
  translation_model = AutoModelForSeq2SeqLM.from_pretrained(translation_model_name)
7
  translation_tokenizer = NllbTokenizer.from_pretrained(translation_model_name)
@@ -10,10 +13,22 @@ summarization_model_name = 'sarahai/ruT5-base-summarizer'
10
  summarization_model = T5ForConditionalGeneration.from_pretrained(summarization_model_name)
11
  summarization_tokenizer = T5Tokenizer.from_pretrained(summarization_model_name)
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  def split_into_chunks(text, tokenizer, max_length=150):
14
- # Tokenize the text and get ids
15
  tokens = tokenizer.tokenize(text)
16
- # Initialize chunks
17
  chunks = []
18
  current_chunk = []
19
  current_length = 0
@@ -24,7 +39,6 @@ def split_into_chunks(text, tokenizer, max_length=150):
24
  chunks.append(tokenizer.convert_tokens_to_string(current_chunk))
25
  current_chunk = []
26
  current_length = 0
27
- # Add the last chunk if it's not empty
28
  if current_chunk:
29
  chunks.append(tokenizer.convert_tokens_to_string(current_chunk))
30
  return chunks
@@ -46,18 +60,28 @@ def summarize(text, model, tokenizer, max_length=250):
46
  summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
47
  return summary
48
 
49
- # Streamlit UI
50
- st.title("Перевод с узбекского на русский и суммаризация")
51
- text = st.text_area("Введите текст на узбекском:", height=200)
52
-
53
- if st.button("Перевести и суммаризировать"):
54
- if text:
55
- with st.spinner('Переводим...'):
56
- translated_text = translate(text, translation_model, translation_tokenizer)
57
- st.text_area("Переведенный текст (на русском):", value=translated_text, height=200)
58
-
59
- with st.spinner('Суммаризируем...'):
60
- summary_text = summarize(translated_text, summarization_model, summarization_tokenizer, max_length=250)
61
- st.text_area("Суммаризация (на русском):", value=summary_text, height=100)
62
- else:
63
- st.warning("Пожалуйста, введите текст на узбекском языке для перевода.")
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
  from transformers import AutoModelForSeq2SeqLM, T5ForConditionalGeneration, NllbTokenizer, T5Tokenizer
3
+ import easyocr
4
+ from PIL import Image
5
+ import numpy as np
6
+
7
 
 
8
  translation_model_name = 'sarahai/nllb-uzbek-cyrillic-to-russian'
9
  translation_model = AutoModelForSeq2SeqLM.from_pretrained(translation_model_name)
10
  translation_tokenizer = NllbTokenizer.from_pretrained(translation_model_name)
 
13
  summarization_model = T5ForConditionalGeneration.from_pretrained(summarization_model_name)
14
  summarization_tokenizer = T5Tokenizer.from_pretrained(summarization_model_name)
15
 
16
def extract_text(image_path, lang='uzb_Cyrl'):
    """Run EasyOCR on an image and return the recognised text and its mean confidence.

    Args:
        image_path: Despite the name, this value is converted with
            ``np.array`` and the resulting array is handed to ``readtext``.
            The caller in this file passes an opened PIL image, not a
            filesystem path.
        lang: Language code used to build the EasyOCR reader.
            NOTE(review): the default 'uzb_Cyrl' looks like an NLLB-style
            code rather than an EasyOCR one; the caller in this file passes
            'tjk'. Confirm against EasyOCR's supported-language list.

    Returns:
        A ``(text, confidence)`` tuple: the recognised fragments joined by
        single spaces (outer whitespace stripped), and the arithmetic mean of
        the per-fragment confidences, or 0 when nothing was detected.
    """
    # NOTE(review): a new Reader is constructed on every call; consider
    # reusing one per language if construction turns out to be expensive.
    reader = easyocr.Reader([lang])
    results = reader.readtext(np.array(image_path))

    # Each result is a (bounding box, text, confidence) triple; the box is unused.
    fragments = [text for _bbox, text, _prob in results]
    confidences = [prob for _bbox, _text, prob in results]

    # Guard against division by zero when no text was detected.
    final_confidence = sum(confidences) / len(confidences) if confidences else 0
    return ' '.join(fragments).strip(), final_confidence
29
+
30
  def split_into_chunks(text, tokenizer, max_length=150):
 
31
  tokens = tokenizer.tokenize(text)
 
32
  chunks = []
33
  current_chunk = []
34
  current_length = 0
 
39
  chunks.append(tokenizer.convert_tokens_to_string(current_chunk))
40
  current_chunk = []
41
  current_length = 0
 
42
  if current_chunk:
43
  chunks.append(tokenizer.convert_tokens_to_string(current_chunk))
44
  return chunks
 
60
  summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
61
  return summary
62
 
63
# Streamlit page: upload an image, OCR it, then translate and summarise on demand.
st.title('Текстовая обработка изображений, перевод с узбекского на русский и суммаризация')

uploaded_file = st.file_uploader(
    "Загрузите изображение с узбекским текстом...",
    type=["jpg", "jpeg", "png"],
)
if uploaded_file is not None:
    # Show the uploaded picture back to the user before processing it.
    image = Image.open(uploaded_file)
    st.image(image, caption='Загруженное изображение', use_column_width=True)

    # OCR step: the language code is passed explicitly; adjust it if necessary.
    st.write("Процесс извлечения текста...")
    extracted_text, confidence = extract_text(image, 'tjk')
    st.write("Извлеченный текст:")
    st.text_area("Результат", extracted_text, height=150)
    st.write(f"Точность распознавания: {confidence*100:.2f}%")

    if st.button("Перевести и суммаризировать"):
        if not extracted_text:
            # Nothing was recognised, so there is nothing to translate.
            st.warning("Текст для перевода не найден.")
        else:
            # Translate the OCR output first ...
            with st.spinner('Переводим...'):
                translated_text = translate(extracted_text, translation_model, translation_tokenizer)
            st.text_area("Переведенный текст (на русском):", value=translated_text, height=200)

            # ... then summarise the translated text.
            with st.spinner('Суммаризируем...'):
                summary_text = summarize(translated_text, summarization_model, summarization_tokenizer, max_length=250)
            st.text_area("Суммаризация (на русском):", value=summary_text, height=100)
87
+