File size: 10,396 Bytes
98b6d67
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
# app/utils/mahalla_matcher.py - improved version (default match threshold 0.35)

"""

Mahalla Matcher - find mahalla names that the user has misspelled.

Combines fuzzy matching, substring matching and word-level matching.

"""

import logging
from typing import Optional, List
from difflib import SequenceMatcher

from app.services.location_validator import get_mahallas_by_district

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)


def normalize_mahalla_text(text: str) -> str:
    """
    Normalize a mahalla name so that two spellings can be compared.

    Steps, in order:
      1. lower-case and collapse runs of whitespace;
      2. remove the words "mahallasi" / "mahalla" wherever they occur;
      3. strip leading/trailing punctuation (``. , ! ?``);
      4. strip one Uzbek case/plural suffix from the LAST word, trying the
         longest suffixes first, and only when more than two characters
         would remain.

    Args:
        text: Raw text (user input or a stored mahalla name).

    Returns:
        The normalized text, or "" when ``text`` is empty/None.
    """
    if not text:
        return ""

    # Lower-case and collapse any run of whitespace into a single space.
    text = " ".join(text.lower().split())

    # Drop the generic words; the space-prefixed forms go first so that no
    # dangling space is left behind.
    text = text.replace(' mahallasi', '').replace(' mahalla', '')
    text = text.replace('mahallasi', '').replace('mahalla', '')

    # Punctuation must go BEFORE suffix stripping, otherwise a trailing
    # "." or "," hides the suffix ("chilonzorga." would keep its "ga").
    text = text.strip('.,!? ')

    suffixes = (
        "ni", "ga", "da", "dan", "ning", "niki",
        "dagi", "dagina", "gacha", "dek",
        "lar", "larni", "larga", "larda", "lardan",
    )

    words = text.split()

    if words:
        last_word = words[-1]
        # Longest first: otherwise "dan" would win over "lardan" and leave
        # the plural marker "lar" attached.
        for suffix in sorted(suffixes, key=len, reverse=True):
            # Keep very short stems intact (at least 3 characters must remain).
            if last_word.endswith(suffix) and len(last_word) > len(suffix) + 2:
                words[-1] = last_word[:-len(suffix)]
                break

    # Final strip covers punctuation exposed by removing the suffix.
    return " ".join(words).strip('.,!? ')


def similarity_score(str1: str, str2: str) -> float:
    """
    Character-level similarity of two strings via ``difflib.SequenceMatcher``.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        The matcher's ratio in the range 0.0 - 1.0 (1.0 = identical).
    """
    matcher = SequenceMatcher(a=str1, b=str2)
    return matcher.ratio()


def word_similarity(str1: str, str2: str) -> float:
    """
    Word-level overlap between two texts.

    Each text is split on whitespace into a set of distinct words; the score
    is the number of shared words divided by the number of distinct words
    across both texts.

    Args:
        str1: First text.
        str2: Second text.

    Returns:
        Overlap score in the range 0.0 - 1.0; 0.0 when either text has no words.
    """
    tokens_a, tokens_b = set(str1.split()), set(str2.split())

    # Nothing to compare when one side is empty.
    if not (tokens_a and tokens_b):
        return 0.0

    shared = tokens_a & tokens_b
    combined = tokens_a | tokens_b
    return len(shared) / len(combined)


def find_mahalla_fuzzy(district_name: str, user_text: str, threshold: float = 0.35) -> Optional[str]:
    """
    Find the mahalla of a district that best matches possibly misspelled text.

    Every candidate returned by ``get_mahallas_by_district`` is normalized
    with ``normalize_mahalla_text`` and scored four ways; the candidate's
    final score is the maximum of:

      * the character-level fuzzy ratio (``similarity_score``);
      * substring containment: 0.9 if the input is inside the name,
        0.85 if the name is inside the input;
      * the word overlap (``word_similarity``);
      * 0.7 if the first words of input and name are identical.

    Candidates whose normalized name is empty are skipped, because an empty
    string is a substring of every input and would otherwise score 0.85.
    On equal scores the earliest candidate wins.

    Args:
        district_name: District name used to look up candidate mahallas.
        user_text: Text typed by the user (e.g. "katta chilonzor").
        threshold: Minimum final score (0.0 - 1.0) needed to accept a match.

    Returns:
        The matching mahalla name exactly as returned by
        ``get_mahallas_by_district``, or None when an argument is empty, the
        normalized input is shorter than 2 characters, the district has no
        mahallas, no candidate reaches ``threshold``, or an unexpected error
        occurs (the error is logged with its traceback, not raised).
    """
    try:
        if not user_text or not district_name:
            return None

        normalized_input = normalize_mahalla_text(user_text)
        logger.info(f"🏘️ Mahalla qidirilmoqda: '{user_text}' → '{normalized_input}' ({district_name})")

        if len(normalized_input) < 2:
            logger.warning("⚠️ Matn juda qisqa")
            return None

        mahallas = get_mahallas_by_district(district_name)

        if not mahallas:
            logger.warning(f"⚠️ {district_name} uchun mahallalar topilmadi")
            return None

        # Loop-invariant values, computed once.
        input_words = normalized_input.split()
        debug_enabled = logger.isEnabledFor(logging.DEBUG)

        best_match = None
        best_score = 0.0
        scoring_details = []  # only filled when DEBUG logging is enabled

        for mahalla in mahallas:
            normalized_mahalla = normalize_mahalla_text(mahalla)

            # An empty name is "contained" in every input; never let it match.
            if not normalized_mahalla:
                continue

            # 1. Whole-string fuzzy match.
            fuzzy_score = similarity_score(normalized_input, normalized_mahalla)

            # 2. Substring match (large fixed bonus).
            substring_score = 0.0
            if normalized_input in normalized_mahalla:
                substring_score = 0.9
                logger.debug(f"  ✓ Substring (input in mahalla): '{normalized_input}' in '{normalized_mahalla}'")
            elif normalized_mahalla in normalized_input:
                substring_score = 0.85
                logger.debug(f"  ✓ Substring (mahalla in input): '{normalized_mahalla}' in '{normalized_input}'")

            # 3. Word overlap.
            word_score = word_similarity(normalized_input, normalized_mahalla)

            # 4. Identical first word.
            mahalla_words = normalized_mahalla.split()
            first_word_score = 0.0
            if input_words and mahalla_words and input_words[0] == mahalla_words[0]:
                first_word_score = 0.7
                logger.debug(f"  ✓ First word match: '{input_words[0]}'")

            # 5. Final score is the best of the individual signals.
            final_score = max(fuzzy_score, substring_score, word_score, first_word_score)

            if debug_enabled:
                scoring_details.append({
                    "mahalla": mahalla,
                    "fuzzy": fuzzy_score,
                    "substring": substring_score,
                    "word": word_score,
                    "first_word": first_word_score,
                    "final": final_score,
                })

            # Strict ">" keeps the earliest candidate on ties.
            if final_score > best_score:
                best_score = final_score
                best_match = mahalla

        # Diagnostic output: the three best-scoring candidates.
        if debug_enabled:
            scoring_details.sort(key=lambda x: x['final'], reverse=True)
            logger.debug("  Top 3 matches:")
            for i, detail in enumerate(scoring_details[:3], 1):
                logger.debug(f"    {i}. {detail['mahalla']}: {detail['final']:.2f} "
                             f"(fuzzy={detail['fuzzy']:.2f}, sub={detail['substring']:.2f}, "
                             f"word={detail['word']:.2f}, first={detail['first_word']:.2f})")

        if best_score >= threshold:
            logger.info(f"✅ Mahalla topildi: '{best_match}' (score: {best_score:.2f})")
            return best_match

        logger.warning(f"⚠️ Mahalla topilmadi (best score: {best_score:.2f} < {threshold})")
        return None

    except Exception as e:
        # Best-effort lookup: callers receive None instead of an exception.
        logger.error(f"❌ Mahalla matching xatoligi: {e}", exc_info=True)
        return None


def get_mahalla_display_name(mahalla_name: str) -> str:
    """
    Return the mahalla name in its full display form.

    Args:
        mahalla_name: e.g. "Beltepa" or "Beltepa mahallasi".

    Returns:
        The name with " mahallasi" appended unless it already contains
        "mahallasi" (case-insensitive); "" for an empty/None name.
    """
    if not mahalla_name:
        return ""

    already_suffixed = 'mahallasi' in mahalla_name.lower()
    return mahalla_name if already_suffixed else f"{mahalla_name} mahallasi"


def suggest_mahallas(district_name: str, user_text: str, top_n: int = 3) -> List[tuple]:
    """
    Suggest the mahallas of a district that are most similar to the user text.

    Each candidate from ``get_mahallas_by_district`` is scored as the maximum
    of the fuzzy ratio, a 0.9 bonus when the normalized input is a substring
    of the normalized name, and the word overlap.

    Args:
        district_name: District name used to look up candidate mahallas.
        user_text: Text typed by the user.
        top_n: Maximum number of suggestions to return.

    Returns:
        ``[(mahalla_name, score), ...]`` sorted best-first, at most ``top_n``
        items. An empty list is returned when an argument is empty, the text
        normalizes to nothing, ``top_n`` is not positive, the district has no
        mahallas, or an unexpected error occurs (logged, not raised).
    """
    try:
        if not user_text or not district_name or top_n <= 0:
            return []

        normalized_input = normalize_mahalla_text(user_text)

        # An empty string is a substring of every name, so without this guard
        # every mahalla would be "suggested" with a score of 0.9.
        if not normalized_input:
            return []

        mahallas = get_mahallas_by_district(district_name)

        if not mahallas:
            return []

        results = []

        for mahalla in mahallas:
            normalized_mahalla = normalize_mahalla_text(mahalla)

            fuzzy = similarity_score(normalized_input, normalized_mahalla)
            substring = 0.9 if normalized_input in normalized_mahalla else 0.0
            word = word_similarity(normalized_input, normalized_mahalla)

            results.append((mahalla, max(fuzzy, substring, word)))

        # Highest score first; the stable sort keeps source order on ties.
        results.sort(key=lambda item: item[1], reverse=True)

        return results[:top_n]

    except Exception as e:
        # Best-effort: log with traceback and fall back to "no suggestions".
        logger.error(f"❌ Mahalla tavsiyalari xatoligi: {e}", exc_info=True)
        return []


# ==================== TESTING HELPER ====================

def test_mahalla_matching(district_name: str, test_inputs: List[str], threshold: float = 0.35) -> None:
    """
    Manual helper that logs how each sample input is matched in a district.

    For every input it calls ``find_mahalla_fuzzy``; when nothing matches it
    logs up to three candidates from ``suggest_mahallas`` instead. Results
    are only written to the module logger, nothing is returned or asserted.

    Args:
        district_name: District to test against.
        test_inputs: Sample user inputs to try.
        threshold: Minimum score passed on to ``find_mahalla_fuzzy``.
    """
    logger.info(f"\n{'='*60}")
    logger.info(f"TEST: {district_name}")
    logger.info(f"{'='*60}")

    for test_input in test_inputs:
        logger.info(f"\nTest input: '{test_input}'")

        result = find_mahalla_fuzzy(district_name, test_input, threshold=threshold)

        if result:
            logger.info(f"  ✅ MATCH: {result}")
            continue

        logger.warning("  ❌ NO MATCH")
        suggestions = suggest_mahallas(district_name, test_input, top_n=3)
        if suggestions:
            logger.info("  💡 Suggestions:")
            for mahalla, score in suggestions:
                logger.info(f"    - {mahalla} ({score:.2f})")

    logger.info(f"{'='*60}\n")


# ==================== MANUAL SMOKE TEST (RUNS ONLY WHEN EXECUTED AS A SCRIPT) ====================

if __name__ == "__main__":
    # Emit DEBUG-level records so the per-candidate scoring is visible.
    logging.basicConfig(level=logging.DEBUG)

    # District name -> sample user inputs to try against that district.
    samples_by_district = {
        "Chilonzor tumani": ["katta chilonzor", "beltepa", "beshqozon", "qorgon"],
        "Bektemir tumani": ["abay", "bektemir", "binokor"],
        "Shayxontohur tumani": ["kamolon", "shayx"],
    }

    for district, inputs in samples_by_district.items():
        test_mahalla_matching(district, inputs)