Commit d0b11df
Parent(s): 51c00f9

Add application file

Files changed:
- extract_text.py +0 -29
- models.py +0 -11
- requirements.txt +2 -12
- text_similarity.py +0 -125
extract_text.py
DELETED
@@ -1,29 +0,0 @@
-import cv2
-import numpy as np
-import easyocr
-import torch
-
-# Initialize EasyOCR
-device = "cuda" if torch.cuda.is_available() else "cpu"
-reader = easyocr.Reader(["en"], gpu=(device == "cuda"), verbose=False)
-
-def extract_text_from_image(img, gpu_available):
-    reader = easyocr.Reader(['en'], gpu=gpu_available, verbose=False)
-
-    img = np.array(img)
-    img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
-
-    # Resizing and blurring
-    scale_factor = 2
-    upscaled = cv2.resize(img, None, fx=scale_factor, fy=scale_factor, interpolation=cv2.INTER_LINEAR)
-    blur_img = cv2.blur(upscaled, (5, 5))
-
-    all_text_found = []
-    text_ = reader.readtext(blur_img, detail=1, paragraph=False, text_threshold=0.3)
-
-    for t in text_:
-        bbox, text, score = t
-        if score > 0.1:  # Filter weak detections
-            all_text_found.append(text)
-
-    return all_text_found
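For context, a minimal sketch of how the deleted helper would have been called. The image path and the CUDA check are assumptions, not part of this commit; note that the module-level reader was shadowed by the per-call reader inside the function, so only the latter was ever used.

# Hypothetical usage of the deleted extract_text_from_image helper.
from PIL import Image
import torch

from extract_text import extract_text_from_image

# "sample.png" is a placeholder path; any RGB image works, since the
# function converts it to a NumPy array and then to BGR for OpenCV.
img = Image.open("sample.png").convert("RGB")

gpu_available = torch.cuda.is_available()

texts = extract_text_from_image(img, gpu_available)
print(texts)  # e.g. ["Invoice", "Total: $42.00"]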
models.py
DELETED
@@ -1,11 +0,0 @@
-from pydantic import BaseModel
-from typing import List
-
-class RequestModel(BaseModel):
-    originId: int
-    source: str
-
-class TextSimilarityRequest(BaseModel):
-    imageInfo: RequestModel
-    keyTexts: List[str]
-    similarityThreshold: float
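A quick sketch of the request payload these deleted Pydantic models validated; the field values below are illustrative, not taken from the commit.

# Hypothetical payload matching the deleted models.
from models import RequestModel, TextSimilarityRequest

request = TextSimilarityRequest(
    imageInfo=RequestModel(originId=1, source="https://example.com/image.png"),
    keyTexts=["confidential", "do not distribute"],
    similarityThreshold=0.7,
)

# model_dump() assumes Pydantic v2; on v1 this would be request.dict().
print(request.model_dump())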
requirements.txt
CHANGED
@@ -1,12 +1,2 @@
-
-
-matplotlib
-easyocr
-scikit-image
-pillow
-pandas
-torch
-uvicorn
-gradio
-requests
-starlette
+fastapi
+uvicorn
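The trimmed requirements leave only a bare FastAPI service. A minimal sketch of an application that runs under exactly these two dependencies; the file name app.py, the route, and port 7860 (the Hugging Face Spaces default) are assumptions, since the actual application file is not shown in this diff.

# app.py — hypothetical minimal service matching the trimmed requirements.
from fastapi import FastAPI

app = FastAPI()

@app.get("/health")
def health():
    # Trivial liveness endpoint; the real application logic is not shown here.
    return {"status": "ok"}

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)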
text_similarity.py
DELETED
@@ -1,125 +0,0 @@
-import re
-from difflib import SequenceMatcher
-from collections import defaultdict
-
-def extract_special_characters(text):
-    """Extracts all unique special characters from a list of texts."""
-    characters = re.findall(r'[^\w\s]', text)  # Finds non-alphanumeric and non-space characters
-    return ''.join(characters)
-
-def clean_text(text, keep=""):
-    """Removes special characters except those specified in 'keep', and converts to lowercase."""
-    pattern = rf'[^\w\s{re.escape(keep)}]'
-    return re.sub(pattern, '', text.lower())
-
-def text_similarity(text, key_text):
-    """Calculates the similarity between two texts using SequenceMatcher."""
-    return SequenceMatcher(None, text, key_text).ratio()
-
-def detect_fragments(text, key_texts, threshold=0.7):
-    """Checks if a text contains fragments of key texts."""
-    for key_text in key_texts:
-        characters_to_not_clean = extract_special_characters(key_text)
-        words = clean_text(text, characters_to_not_clean).split()
-
-        key_words = key_text.split()
-
-        # If the text is too short, we can't make an effective sliding window
-        if len(words) < len(key_words):
-            similarity = text_similarity(text, key_text)
-            if similarity >= threshold:
-                return True, key_text, similarity
-            continue
-
-        # Sliding window to compare word sequences
-        for i in range(len(words) - len(key_words) + 1):
-            fragment = " ".join(words[i:i+len(key_words)])
-            similarity = text_similarity(fragment, key_text)
-            if similarity >= threshold:
-                return True, key_text, similarity
-    return False, None, 0
-
-def analyze_similarity(text_list, key_texts, similarity_threshold=0.7, fragment_threshold=0.7):
-    """
-    Analyzes the similarity between a list of texts and key texts.
-    Returns a detailed report on the similarities found.
-    """
-    results = {
-        "similar_texts": [],
-        "fragments_detected": [],
-        "combined": [],
-        "statistics": defaultdict(int)
-    }
-
-    processed_texts = set()
-
-    # Check direct similarity
-    for i, text in enumerate(text_list):
-        if not text.strip():
-            continue
-
-        for key_text in key_texts:
-            if not key_text.strip():
-                continue
-
-            similarity = text_similarity(text, key_text)
-            if similarity >= similarity_threshold:
-                results["similar_texts"].append({
-                    "index": i,
-                    "text": text,
-                    "key_text": key_text,
-                    "similarity": similarity
-                })
-                results["statistics"]["direct_similarity"] += 1
-                processed_texts.add(i)
-
-    # Check fragments
-    # for i, text in enumerate(text_list):
-    #     if i in processed_texts or not text.strip():
-    #         continue
-
-    #     has_fragment, key_text, similarity = detect_fragments(text, key_texts, fragment_threshold)
-    #     if has_fragment:
-    #         results["fragments_detected"].append({
-    #             "index": i,
-    #             "text": text,
-    #             "key_text": key_text,
-    #             "similarity": similarity
-    #         })
-    #         results["statistics"]["fragments"] += 1
-    #         processed_texts.add(i)
-
-    # Check texts that can be combined
-    for i in range(len(text_list)):
-        if i in processed_texts or not text_list[i].strip():
-            continue
-
-        for j in range(i+1, len(text_list)):
-            if j in processed_texts or not text_list[j].strip():
-                continue
-
-            combined_text = text_list[i] + " " + text_list[j]
-            for key_text in key_texts:
-                if not key_text.strip():
-                    continue
-
-                similarity = text_similarity(combined_text, key_text)
-                if similarity >= similarity_threshold:
-                    results["combined"].append({
-                        "indices": [i, j],
-                        "texts": [text_list[i], text_list[j]],
-                        "combined_text": combined_text,
-                        "key_text": key_text,
-                        "similarity": similarity
-                    })
-                    results["statistics"]["combined"] += 1
-                    processed_texts.add(i)
-                    processed_texts.add(j)
-                    break
-
-    # Calculate overall statistics
-    valid_texts = sum(1 for text in text_list if text.strip())
-    results["statistics"]["total_analyzed"] = valid_texts
-    results["statistics"]["total_processed"] = len(processed_texts)
-
-    return results
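For reference, a small sketch of how the deleted analyze_similarity function was meant to be driven; the OCR snippets and key text below are made up, and the threshold of 0.8 is chosen so that the pairwise "combined" pass fires rather than the direct check.

# Hypothetical usage of the deleted text_similarity module.
from text_similarity import analyze_similarity

ocr_snippets = ["strictly", "confidential", "quarterly report 2024"]
key_texts = ["strictly confidential"]

report = analyze_similarity(ocr_snippets, key_texts, similarity_threshold=0.8)

# The "combined" pass catches "strictly" + "confidential" matching the key
# text exactly, even though neither snippet clears 0.8 on its own.
for match in report["combined"]:
    print(match["indices"], match["combined_text"], round(match["similarity"], 2))
    # [0, 1] strictly confidential 1.0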