TuanMinhajSeedin commited on
Commit
eaa41b7
·
verified ·
1 Parent(s): e19f1a0

Upload 10 files

Browse files
Files changed (9) hide show
  1. .gitignore +151 -175
  2. app.py +1215 -974
  3. enhanced_nlp_processor.py +904 -904
  4. language_detector.py +251 -0
  5. llm_query_processor.py +384 -351
  6. logger.py +61 -61
  7. neo4j_service.py +222 -222
  8. spell_corrector.py +257 -257
  9. translation_service.py +1057 -702
.gitignore CHANGED
@@ -1,175 +1,151 @@
1
- # Byte-compiled / optimized / DLL files
2
- __pycache__/
3
- *.py[cod]
4
- *$py.class
5
-
6
- # C extensions
7
- *.so
8
-
9
- # Distribution / packaging
10
- .Python
11
- build/
12
- develop-eggs/
13
- dist/
14
- downloads/
15
- eggs/
16
- .eggs/
17
- lib/
18
- lib64/
19
- parts/
20
- sdist/
21
- var/
22
- wheels/
23
- share/python-wheels/
24
- *.egg-info/
25
- .installed.cfg
26
- *.egg
27
- MANIFEST
28
-
29
- # PyInstaller
30
- # Usually these files are written by a python script from a template
31
- # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
- *.manifest
33
- *.spec
34
-
35
- # Installer logs
36
- pip-log.txt
37
- pip-delete-this-directory.txt
38
-
39
- # Unit test / coverage reports
40
- htmlcov/
41
- .tox/
42
- .nox/
43
- .coverage
44
- .coverage.*
45
- .cache
46
- nosetests.xml
47
- coverage.xml
48
- *.cover
49
- *.py,cover
50
- .hypothesis/
51
- .pytest_cache/
52
- cover/
53
-
54
- # Translations
55
- *.mo
56
- *.pot
57
-
58
- # Django stuff:
59
- *.log
60
- local_settings.py
61
- db.sqlite3
62
- db.sqlite3-journal
63
-
64
- # Flask stuff:
65
- instance/
66
- .webassets-cache
67
-
68
- # Scrapy stuff:
69
- .scrapy
70
-
71
- # Sphinx documentation
72
- docs/_build/
73
-
74
- # PyBuilder
75
- .pybuilder/
76
- target/
77
-
78
- # Jupyter Notebook
79
- .ipynb_checkpoints
80
-
81
- # IPython
82
- profile_default/
83
- ipython_config.py
84
-
85
- # pyenv
86
- # For a library or package, you might want to ignore these files since the code is
87
- # intended to run in multiple environments; otherwise, check them in:
88
- # .python-version
89
-
90
- # pipenv
91
- # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
- # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
- # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
- # install all needed dependencies.
95
- #Pipfile.lock
96
-
97
- # UV
98
- # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
- # This is especially recommended for binary packages to ensure reproducibility, and is more
100
- # commonly ignored for libraries.
101
- #uv.lock
102
-
103
- # poetry
104
- # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
- # This is especially recommended for binary packages to ensure reproducibility, and is more
106
- # commonly ignored for libraries.
107
- # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
- #poetry.lock
109
-
110
- # pdm
111
- # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
112
- #pdm.lock
113
- # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
114
- # in version control.
115
- # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
116
- .pdm.toml
117
- .pdm-python
118
- .pdm-build/
119
-
120
- # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
121
- __pypackages__/
122
-
123
- # Celery stuff
124
- celerybeat-schedule
125
- celerybeat.pid
126
-
127
- # SageMath parsed files
128
- *.sage.py
129
-
130
- # Environments
131
- config.py
132
- .env
133
- .venv
134
- env/
135
- venv/
136
- ENV/
137
- env.bak/
138
- venv.bak/
139
-
140
- # Spyder project settings
141
- .spyderproject
142
- .spyproject
143
-
144
- # Rope project settings
145
- .ropeproject
146
-
147
- # mkdocs documentation
148
- /site
149
-
150
- # mypy
151
- .mypy_cache/
152
- .dmypy.json
153
- dmypy.json
154
-
155
- # Pyre type checker
156
- .pyre/
157
-
158
- # pytype static type analyzer
159
- .pytype/
160
-
161
- # Cython debug symbols
162
- cython_debug/
163
-
164
- # PyCharm
165
- # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
166
- # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
167
- # and can be added to the global gitignore or merged into this file. For a more nuclear
168
- # option (not recommended) you can uncomment the following to ignore the entire idea folder.
169
- #.idea/
170
-
171
- # Ruff stuff:
172
- .ruff_cache/
173
-
174
- # PyPI configuration file
175
- .pypirc
 
1
+ # Environment variables
2
+ .env
3
+ .env.local
4
+ .env.*.local
5
+
6
+ # Python
7
+ __pycache__/
8
+ *.py[cod]
9
+ *$py.class
10
+ *.so
11
+ .Python
12
+ build/
13
+ develop-eggs/
14
+ dist/
15
+ downloads/
16
+ eggs/
17
+ .eggs/
18
+ lib/
19
+ lib64/
20
+ parts/
21
+ sdist/
22
+ var/
23
+ wheels/
24
+ pip-wheel-metadata/
25
+ share/python-wheels/
26
+ *.egg-info/
27
+ .installed.cfg
28
+ *.egg
29
+ MANIFEST
30
+
31
+ # PyInstaller
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+
53
+ # Translations
54
+ *.mo
55
+ *.pot
56
+
57
+ # Django stuff:
58
+ *.log
59
+ local_settings.py
60
+ db.sqlite3
61
+ db.sqlite3-journal
62
+
63
+ # Flask stuff:
64
+ instance/
65
+ .webassets-cache
66
+
67
+ # Scrapy stuff:
68
+ .scrapy
69
+
70
+ # Sphinx documentation
71
+ docs/_build/
72
+
73
+ # PyBuilder
74
+ target/
75
+
76
+ # Jupyter Notebook
77
+ .ipynb_checkpoints
78
+
79
+ # IPython
80
+ profile_default/
81
+ ipython_config.py
82
+
83
+ # pyenv
84
+ .python-version
85
+
86
+ # pipenv
87
+ Pipfile.lock
88
+
89
+ # PEP 582
90
+ __pypackages__/
91
+
92
+ # Celery stuff
93
+ celerybeat-schedule
94
+ celerybeat.pid
95
+
96
+ # SageMath parsed files
97
+ *.sage.py
98
+
99
+ # Environments
100
+ .venv
101
+ env/
102
+ venv/
103
+ ENV/
104
+ env.bak/
105
+ venv.bak/
106
+
107
+ # Spyder project settings
108
+ .spyderproject
109
+ .spyproject
110
+
111
+ # Rope project settings
112
+ .ropeproject
113
+
114
+ # mkdocs documentation
115
+ /site
116
+
117
+ # mypy
118
+ .mypy_cache/
119
+ .dmypy.json
120
+ dmypy.json
121
+
122
+ # Pyre type checker
123
+ .pyre/
124
+
125
+ # IDE
126
+ .vscode/
127
+ .idea/
128
+ *.swp
129
+ *.swo
130
+ *~
131
+
132
+ # OS
133
+ .DS_Store
134
+ .DS_Store?
135
+ ._*
136
+ .Spotlight-V100
137
+ .Trashes
138
+ ehthumbs.db
139
+ Thumbs.db
140
+
141
+ # Logs
142
+ logs/
143
+ *.log
144
+
145
+ # Database
146
+ *.db
147
+ *.sqlite
148
+
149
+ # Temporary files
150
+ *.tmp
151
+ *.temp
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py CHANGED
@@ -1,974 +1,1215 @@
1
- #!/usr/bin/env python3
2
- """
3
- Main Flask Application for Transport Query System
4
- """
5
-
6
- from flask import Flask, render_template, request, jsonify, session
7
- import os
8
- from llm_query_processor import LLMQueryProcessor
9
- from enhanced_nlp_processor import EnhancedNLPProcessor
10
- from spell_corrector import SpellCorrector
11
- from neo4j_service import Neo4jService
12
- from translation_service import TranslationService
13
- from logger import get_logger
14
- from config import Config
15
-
16
- app = Flask(__name__)
17
- app.config.from_object(Config)
18
- logger = get_logger("FlaskApp")
19
-
20
- # Initialize services
21
- query_processor = LLMQueryProcessor()
22
- enhanced_nlp_processor = EnhancedNLPProcessor()
23
- spell_corrector = SpellCorrector()
24
- neo4j_service = Neo4jService()
25
- translation_service = TranslationService()
26
-
27
- @app.route('/')
28
- def index():
29
- """Main page"""
30
- return render_template('index.html')
31
-
32
- @app.route('/api/query', methods=['POST'])
33
- def process_query():
34
- """Process user query with enhanced NLP and translation support"""
35
- try:
36
- data = request.get_json()
37
- user_query = data.get('query', '').strip()
38
- use_enhanced_nlp = data.get('enhanced_nlp', True) # Default to enhanced NLP
39
-
40
- if not user_query:
41
- return jsonify({
42
- 'success': False,
43
- 'message': 'Please enter a query.'
44
- })
45
-
46
- # Check if query is in Sinhala and translate if needed
47
- translation_info = translation_service.translate_query(user_query)
48
-
49
- # Use translated query for processing
50
- query_to_process = translation_info['translated_query']
51
-
52
- # Log translation info to console
53
- if translation_info['is_sinhala']:
54
- logger.info(f"Translation: si->en method={translation_info['translation_method']} original='{translation_info['original_query']}' translated='{translation_info['translated_query']}'")
55
- else:
56
- logger.info(f"Processing English Query: '{user_query}'")
57
-
58
- # Process the query with enhanced NLP or fallback to basic processor
59
- if use_enhanced_nlp:
60
- result = enhanced_nlp_processor.process_query(query_to_process)
61
- else:
62
- result = query_processor.process_query(query_to_process)
63
-
64
- # If original query was in Sinhala, translate the response back
65
- if translation_info['is_sinhala']:
66
- print(f" English Response: {result.get('message', 'No message')}")
67
- result = translation_service.translate_response(result)
68
- result['translation_info'] = translation_info
69
- print(f" Sinhala Response: {result.get('message', 'No message')}")
70
- print(f" Translation Complete ✅")
71
-
72
- logger.info(f"Response success={result.get('success')} type={result.get('query_type','n/a')} message='{result.get('message','')[:120]}'")
73
- return jsonify(result)
74
-
75
- except Exception as e:
76
- return jsonify({
77
- 'success': False,
78
- 'message': f'Error processing query: {str(e)}'
79
- })
80
-
81
- @app.route('/api/suggestions', methods=['POST'])
82
- def get_suggestions():
83
- """Get location suggestions for autocomplete"""
84
- try:
85
- data = request.get_json()
86
- partial_location = data.get('location', '').strip()
87
-
88
- if not partial_location:
89
- return jsonify({'suggestions': []})
90
-
91
- suggestions = spell_corrector.get_suggestions(partial_location)
92
-
93
- return jsonify({
94
- 'suggestions': [{'name': name, 'confidence': conf} for name, conf in suggestions]
95
- })
96
-
97
- except Exception as e:
98
- return jsonify({
99
- 'success': False,
100
- 'message': f'Error getting suggestions: {str(e)}'
101
- })
102
-
103
- @app.route('/api/status')
104
- def get_status():
105
- """Get system status"""
106
- try:
107
- neo4j_connected = neo4j_service.is_connected()
108
- places = neo4j_service.get_all_places() if neo4j_connected else []
109
- stats = neo4j_service.get_route_statistics() if neo4j_connected else {}
110
-
111
- return jsonify({
112
- 'neo4j_connected': neo4j_connected,
113
- 'total_places': len(places),
114
- 'statistics': stats
115
- })
116
-
117
- except Exception as e:
118
- return jsonify({
119
- 'success': False,
120
- 'message': f'Error getting status: {str(e)}'
121
- })
122
-
123
- @app.route('/api/places')
124
- def get_places():
125
- """Get all available places"""
126
- try:
127
- places = neo4j_service.get_all_places()
128
- return jsonify({
129
- 'success': True,
130
- 'places': places
131
- })
132
-
133
- except Exception as e:
134
- return jsonify({
135
- 'success': False,
136
- 'message': f'Error getting places: {str(e)}'
137
- })
138
-
139
- @app.route('/api/sinhala/examples')
140
- def get_sinhala_examples():
141
- """Get example queries in Sinhala"""
142
- try:
143
- sinhala_examples = translation_service.get_sinhala_examples()
144
- return jsonify({
145
- 'success': True,
146
- 'examples': sinhala_examples
147
- })
148
-
149
- except Exception as e:
150
- return jsonify({
151
- 'success': False,
152
- 'message': f'Error getting Sinhala examples: {str(e)}'
153
- })
154
-
155
- @app.route('/api/translation/test')
156
- def test_translation():
157
- """Test translation functionality"""
158
- try:
159
- test_results = translation_service.test_translation()
160
- return jsonify({
161
- 'success': True,
162
- 'test_results': test_results
163
- })
164
-
165
- except Exception as e:
166
- return jsonify({
167
- 'success': False,
168
- 'message': f'Error testing translation: {str(e)}'
169
- })
170
-
171
- @app.route('/api/translation/translate', methods=['POST'])
172
- def translate_text():
173
- """Translate text between Sinhala and English"""
174
- try:
175
- data = request.get_json()
176
- text = data.get('text', '').strip()
177
- target_lang = data.get('target_lang', 'en') # 'en' or 'si'
178
- source_lang = data.get('source_lang', 'auto')
179
-
180
- if not text:
181
- return jsonify({
182
- 'success': False,
183
- 'message': 'Please provide text to translate.'
184
- })
185
-
186
- translated_text = translation_service.translate_text(text, target_lang, source_lang)
187
- is_sinhala = translation_service.is_sinhala_text(text)
188
-
189
- return jsonify({
190
- 'success': True,
191
- 'original_text': text,
192
- 'translated_text': translated_text,
193
- 'source_language': 'si' if is_sinhala else 'en',
194
- 'target_language': target_lang,
195
- 'translation_method': 'google' if translation_service.google_translate_api_key else 'dictionary'
196
- })
197
-
198
- except Exception as e:
199
- return jsonify({
200
- 'success': False,
201
- 'message': f'Error translating text: {str(e)}'
202
- })
203
-
204
- @app.route('/api/nlp/capabilities')
205
- def get_nlp_capabilities():
206
- """Get information about natural language processing capabilities with live examples"""
207
-
208
- # Test queries for each type to demonstrate actual results
209
- test_queries = [
210
- {
211
- 'type': 'fare_inquiry',
212
- 'description': 'Find fare between two specific locations',
213
- 'examples': [
214
- 'What is the fare from Colombo to Kandy?',
215
- 'fare of anuradhapura to kandy',
216
- 'price from panadura to galle',
217
- 'Colombo to Kandy fare'
218
- ]
219
- },
220
- {
221
- 'type': 'comparison',
222
- 'description': 'Compare fares between different routes',
223
- 'examples': [
224
- 'Compare fares from Colombo to Kandy vs Colombo to Galle',
225
- 'Which is cheaper between Colombo to Kandy and Colombo to Anuradapura?',
226
- 'What is the difference in fare between Panadura to Galle and Panadura to Matara?'
227
- ]
228
- },
229
- {
230
- 'type': 'range_search',
231
- 'description': 'Find routes within specific price ranges',
232
- 'examples': [
233
- 'Find routes under 500 rupees',
234
- 'Show me routes between 200 and 800 rupees',
235
- 'Routes over 1000 rupees'
236
- ]
237
- },
238
- {
239
- 'type': 'recommendation',
240
- 'description': 'Get route recommendations based on criteria',
241
- 'examples': [
242
- 'Recommend cheap routes',
243
- 'Show me popular destinations',
244
- 'What are the best routes from Colombo?'
245
- ]
246
- },
247
- {
248
- 'type': 'route_inquiry',
249
- 'description': 'Find routes from/to specific locations',
250
- 'examples': [
251
- 'Routes from Colombo',
252
- 'Routes to Galle',
253
- 'What routes depart from Kandy?'
254
- ]
255
- },
256
- {
257
- 'type': 'statistics',
258
- 'description': 'Get database overview and statistics',
259
- 'examples': [
260
- 'What is the average fare?',
261
- 'Database statistics',
262
- 'How many routes are there?'
263
- ]
264
- }
265
- ]
266
-
267
- # Process each test query to get actual results
268
- live_examples = []
269
- for query_type in test_queries:
270
- type_examples = []
271
- for example_query in query_type['examples'][:2]: # Test first 2 examples
272
- try:
273
- result = enhanced_nlp_processor.process_query(example_query)
274
- type_examples.append({
275
- 'query': example_query,
276
- 'result': result
277
- })
278
- except Exception as e:
279
- type_examples.append({
280
- 'query': example_query,
281
- 'result': {
282
- 'success': False,
283
- 'message': f'Error: {str(e)}'
284
- }
285
- })
286
-
287
- live_examples.append({
288
- 'type': query_type['type'],
289
- 'description': query_type['description'],
290
- 'examples': type_examples
291
- })
292
-
293
- capabilities = {
294
- 'natural_language_processing': {
295
- 'description': 'Advanced NLP for transport queries with enhanced understanding',
296
- 'features': [
297
- 'Multiple query formats (fare, price, cost)',
298
- 'Natural language patterns (from X to Y, X to Y fare, etc.)',
299
- 'Question formats (What is, How much, Show me, etc.)',
300
- 'Compact formats (Colombo to Kandy fare)',
301
- 'Spell correction and fuzzy matching',
302
- 'Automatic location name correction',
303
- 'LLM-powered query interpretation',
304
- 'Fallback keyword-based processing',
305
- 'Advanced intent classification',
306
- 'Entity extraction and normalization',
307
- 'Confidence scoring for query understanding'
308
- ]
309
- },
310
- 'query_types': test_queries,
311
- 'live_examples': live_examples,
312
- 'spell_correction': {
313
- 'description': 'Automatic location name correction',
314
- 'methods': [
315
- 'Direct mapping (exact matches)',
316
- 'Fuzzy matching (similar names)',
317
- 'LLM correction (AI-powered)',
318
- 'Partial matching (substring matching)'
319
- ],
320
- 'examples': [
321
- 'panadra Panadura',
322
- 'gale → Galle',
323
- 'colmbo → Colombo',
324
- 'kandee → Kandy'
325
- ]
326
- },
327
- 'llm_integration': {
328
- 'description': 'AI-powered query interpretation with LLM Cypher generation',
329
- 'features': [
330
- 'Automatic query type detection',
331
- 'LLM-powered Cypher query generation',
332
- 'Natural language understanding',
333
- 'Fallback to keyword-based processing',
334
- 'Advanced entity extraction',
335
- 'Intent classification with confidence scoring',
336
- 'Real-time database querying'
337
- ]
338
- },
339
- 'enhanced_features': {
340
- 'description': 'Advanced NLP capabilities',
341
- 'features': [
342
- 'Multi-intent query understanding',
343
- 'Context-aware responses',
344
- 'Query preprocessing and normalization',
345
- 'Advanced pattern matching',
346
- 'Confidence-based result ranking',
347
- 'Comprehensive query analysis',
348
- 'Live database results for all query types'
349
- ]
350
- }
351
- }
352
-
353
- return jsonify({
354
- 'success': True,
355
- 'capabilities': capabilities
356
- })
357
-
358
- @app.route('/api/nlp/test', methods=['POST'])
359
- def test_nlp_query():
360
- """Test a natural language query and return detailed analysis"""
361
- try:
362
- data = request.get_json()
363
- user_query = data.get('query', '').strip()
364
- use_enhanced_nlp = data.get('enhanced_nlp', True)
365
-
366
- if not user_query:
367
- return jsonify({
368
- 'success': False,
369
- 'message': 'Please provide a query to test.'
370
- })
371
-
372
- # Get detailed analysis
373
- analysis = {
374
- 'original_query': user_query,
375
- 'processing_steps': []
376
- }
377
-
378
- # Step 1: Extract locations
379
- locations = spell_corrector.extract_locations_from_query(user_query)
380
- analysis['processing_steps'].append({
381
- 'step': 'Location Extraction',
382
- 'locations_found': len(locations),
383
- 'details': [
384
- {
385
- 'original': loc[0],
386
- 'corrected': loc[1],
387
- 'confidence': loc[2],
388
- 'method': loc[3]
389
- } for loc in locations
390
- ]
391
- })
392
-
393
- # Step 2: Process query with enhanced NLP
394
- if use_enhanced_nlp:
395
- result = enhanced_nlp_processor.process_query(user_query)
396
- analysis['processing_steps'].append({
397
- 'step': 'Enhanced NLP Processing',
398
- 'success': result.get('success', False),
399
- 'query_type': result.get('query_type', 'unknown'),
400
- 'message': result.get('message', ''),
401
- 'confidence': result.get('query_analysis', {}).get('confidence', 0),
402
- 'intent': result.get('query_analysis', {}).get('intent', {}),
403
- 'entities': result.get('query_analysis', {}).get('entities', {})
404
- })
405
- else:
406
- result = query_processor.process_query(user_query)
407
- analysis['processing_steps'].append({
408
- 'step': 'Basic Query Processing',
409
- 'success': result.get('success', False),
410
- 'query_type': result.get('query_type', 'unknown'),
411
- 'message': result.get('message', ''),
412
- 'cypher_query': result.get('cypher_query', ''),
413
- 'corrections': result.get('corrections', [])
414
- })
415
-
416
- # Step 3: Results
417
- if result.get('success') and result.get('data'):
418
- analysis['processing_steps'].append({
419
- 'step': 'Database Results',
420
- 'results_count': len(result['data']),
421
- 'sample_results': result['data'][:3] # Show first 3 results
422
- })
423
-
424
- return jsonify({
425
- 'success': True,
426
- 'analysis': analysis,
427
- 'result': result
428
- })
429
-
430
- except Exception as e:
431
- return jsonify({
432
- 'success': False,
433
- 'message': f'Error testing NLP query: {str(e)}'
434
- })
435
-
436
- @app.route('/api/nlp/demo')
437
- def get_nlp_demo():
438
- """Get a comprehensive demo of natural language capabilities"""
439
- demo_queries = [
440
- {
441
- 'category': 'Basic Fare Queries',
442
- 'queries': [
443
- 'What is the fare from Colombo to Kandy?',
444
- 'fare of anuradhapura to kandy',
445
- 'price from panadura to galle',
446
- 'Colombo to Kandy fare'
447
- ]
448
- },
449
- {
450
- 'category': 'Comparison Queries',
451
- 'queries': [
452
- 'Compare fares from Colombo to Kandy vs Colombo to Galle',
453
- 'Which is cheaper between Colombo to Kandy and Colombo to Anuradapura?',
454
- 'What is the difference in fare between Panadura to Galle and Panadura to Matara?'
455
- ]
456
- },
457
- {
458
- 'category': 'Range Search Queries',
459
- 'queries': [
460
- 'Find routes under 500 rupees',
461
- 'Show me routes between 200 and 800 rupees',
462
- 'Routes over 1000 rupees'
463
- ]
464
- },
465
- {
466
- 'category': 'Recommendation Queries',
467
- 'queries': [
468
- 'Recommend cheap routes',
469
- 'Show me popular destinations',
470
- 'What are the best routes from Colombo?'
471
- ]
472
- },
473
- {
474
- 'category': 'Statistical Queries',
475
- 'queries': [
476
- 'What is the average fare?',
477
- 'Database statistics',
478
- 'How many routes are there?'
479
- ]
480
- },
481
- {
482
- 'category': 'Route Queries',
483
- 'queries': [
484
- 'Show me the cheapest routes',
485
- 'Routes from Colombo',
486
- 'Routes to Galle',
487
- 'What routes depart from Kandy?'
488
- ]
489
- },
490
- {
491
- 'category': 'Spell Correction Tests',
492
- 'queries': [
493
- 'price from panadra to gale',
494
- 'fare of colmbo to kandee',
495
- 'cost from anuradapura to kandy'
496
- ]
497
- }
498
- ]
499
-
500
- return jsonify({
501
- 'success': True,
502
- 'demo': {
503
- 'title': 'Enhanced Natural Language Transport Query Demo',
504
- 'description': 'Advanced NLP capabilities with comparison, range search, and recommendations',
505
- 'categories': demo_queries
506
- }
507
- })
508
-
509
- @app.route('/api/examples')
510
- def get_examples():
511
- """Get comprehensive example queries showcasing natural language capabilities"""
512
- examples = [
513
- # === FARE QUERIES (Various Natural Language Formats) ===
514
- {
515
- 'category': 'Fare Queries',
516
- 'examples': [
517
- {
518
- # 'query': 'What is the fare from Colombo to Kandy?',
519
- 'query': 'කොළඹ සිට මහනුවරට ගාස්තුව කීයද?',
520
- 'description': 'Standard fare query format'
521
- },
522
- {
523
- 'query': 'පානදුරේ ඉඳන් ගාල්ලට කීයක් යනවද?',
524
- 'description': 'Alternative way to ask for fare'
525
- },
526
- {
527
- 'query': 'අනුරාධපුර සිට මහනුවර දක්වා ගාස්තුව',
528
- 'description': 'Natural language format'
529
- },
530
- {
531
- # 'query': 'price from panadura to galle',
532
- 'query': 'පානදුරේ ඉඳන් ගාල්ලට කීයක් යනවද?',
533
- 'description': 'Using "price" instead of "fare"'
534
- },
535
- {
536
- # 'query': 'Colombo to nuwara eliya fare',
537
- 'query': 'බදුල්ල සිට කොළඹට ගාස්තුව කීයද?',
538
- 'description': 'Compact format'
539
- },
540
- {
541
- # 'query': 'How much is the fare from matara to kandy?',
542
- 'query': 'මහනුවර සිට මාතරට ගාස්තුව කීයද?',
543
- 'description': 'Question format'
544
- }
545
- ]
546
- },
547
-
548
- # === COMPARISON QUERIES ===
549
- {
550
- 'category': 'Comparison Queries',
551
- 'examples': [
552
- {
553
- # 'query': 'Compare fares from Colombo to Kandy vs Colombo to Galle',
554
- 'query': 'කොළඹ සිට මහනුවර දක්වා සහ කොළඹ සිට ගාල්ල දක්වා ගාස්තු සංසන්දනය කරන්න.',
555
- 'description': 'Compare two different routes'
556
- },
557
- {
558
- # 'query': 'Which is cheaper between Colombo to Kandy and Colombo to Anuradapura?',
559
- 'query': 'කොළඹ සිට මහනුවර දක්වා සහ කොළඹ සිට අනුරාධපුර දක්වා ලාභදායී වන්නේ කුමක්ද?',
560
- 'description': 'Find the cheaper option'
561
- },
562
- {
563
- # 'query': 'What is the difference in fare between Panadura to Galle and Panadura to Matara?',
564
- 'query': 'පානදුර සිට ගාල්ල දක්වා සහ පානදුර සිට මාතර දක්වා ගාස්තුවේ වෙනස කීයද?',
565
- 'description': 'Calculate fare difference'
566
- }
567
- ]
568
- },
569
-
570
- # === RANGE SEARCH QUERIES ===
571
- {
572
- 'category': 'Range Search Queries',
573
- 'examples': [
574
- {
575
- # 'query': 'Find routes under 500 rupees',
576
- 'query': 'රුපියල් 500ට අඩු මාර්ග සොයා ගන්න',
577
- 'description': 'Find affordable routes'
578
- },
579
- {
580
- # 'query': 'Show me routes between 200 and 800 rupees',
581
- 'query': 'රුපියල් 200 සහ 800 අතර මාර්ග සොයා ගන්න',
582
- 'description': 'Find routes in price range'
583
- },
584
- {
585
- # 'query': 'Routes over 1000 rupees',
586
- 'query': 'රුපියල් 1000ට ඉහළ මාර්ග සොයා ගන්න',
587
- 'description': 'Find expensive routes'
588
- }
589
- ]
590
- },
591
-
592
- # === RECOMMENDATION QUERIES ===
593
- {
594
- 'category': 'Recommendation Queries',
595
- 'examples': [
596
- {
597
- # 'query': 'Recommend cheap routes',
598
- 'query': 'ලාභ මාර්ග නිර්දේශ කරන්න',
599
- 'description': 'Get budget-friendly recommendations'
600
- },
601
- {
602
- # 'query': 'Show me popular destinations',
603
- 'query': 'මට ජනප්‍රිය ගමනාන්ත පෙන්වන්න',
604
- 'description': 'Find frequently traveled routes'
605
- },
606
- {
607
- # 'query': 'What are the best routes from Colombo?',
608
- 'query': 'කොළඹ සිට යාමට හොඳම මාර්ග මොනවාද?',
609
- 'description': 'Get optimal route suggestions'
610
- }
611
- ]
612
- },
613
-
614
- # === STATISTICAL QUERIES ===
615
- {
616
- 'category': 'Statistical Queries',
617
- 'examples': [
618
- {
619
- # 'query': 'What is the average fare?',
620
- 'query': 'සාමාන්‍ය ගාස්තුව කීයද?',
621
- 'description': 'Get average fare statistics'
622
- },
623
- {
624
- # 'query': 'Database statistics',
625
- 'query': 'දත්ත සමුදා සංඛ්යා ලේඛන',
626
- 'description': 'Get comprehensive database overview'
627
- },
628
- {
629
- 'query': 'මාර්ග කීයක් තිබේද?',
630
- 'description': 'Count total routes'
631
- }
632
- ]
633
- },
634
-
635
- # === ROUTE QUERIES ===
636
- {
637
- 'category': 'Route Queries',
638
- 'examples': [
639
- {
640
- # 'query': 'Show me the cheapest routes',
641
- 'query': 'මට ලාභදායී මාර්ග 10ක් පෙන්වන්න',
642
- 'description': 'Find top 10 cheapest routes'
643
- },
644
- {
645
- # 'query': 'Routes from Colombo',
646
- 'query': 'කොළඹ සිට යාමට මාර්ග මොනවාද?',
647
- 'description': 'Find all routes departing from a location'
648
- },
649
- {
650
- # 'query': 'Routes to Galle',
651
- 'query': 'ගාල්ල යාමට මාර්ග මොනවාද?',
652
- 'description': 'Find all routes going to a location'
653
- },
654
- {
655
- # 'query': 'What routes depart from Kandy?',
656
- 'query': 'මහනුවර සිට යාමට මාර්ග මොනවාද?',
657
- 'description': 'Question format for routes'
658
- }
659
- ]
660
- },
661
-
662
- # === SPELLING ERROR EXAMPLES ===
663
- {
664
- 'category': 'Spell Correction Examples',
665
- 'examples': [
666
- {
667
- # 'query': 'price from panadra to gale',
668
- 'query': 'පාන්දුරේ ඉඳන් ගාල්ල්ට කීයක් යනවද?',
669
- 'description': 'Test spell correction (Panadura, Galle)'
670
- },
671
- {
672
- # 'query': 'fare of colmbo to kandee',
673
- 'query': 'කොළ්බ්හ සිට මහනුවර්ට ගාස්තුව කීයද?',
674
- 'description': 'Test spell correction (Colombo, Kandy)'
675
- },
676
- {
677
- # 'query': 'cost from anuradapura to kandy',
678
- 'query': 'අනුරපුර සිට මහනුවර්රට ගාස්තුව කීයද?',
679
- 'description': 'Natural format with correct spelling'
680
- }
681
- ]
682
- }
683
- ]
684
-
685
- return jsonify({
686
- 'success': True,
687
- 'examples': examples
688
- })
689
-
690
- @app.route('/api/nlp/advanced', methods=['POST'])
691
- def advanced_nlp_query():
692
- """Advanced NLP query processing with detailed analysis"""
693
- try:
694
- data = request.get_json()
695
- user_query = data.get('query', '').strip()
696
-
697
- if not user_query:
698
- return jsonify({
699
- 'success': False,
700
- 'message': 'Please provide a query to process.'
701
- })
702
-
703
- # Process with enhanced NLP
704
- result = enhanced_nlp_processor.process_query(user_query)
705
-
706
- return jsonify(result)
707
-
708
- except Exception as e:
709
- return jsonify({
710
- 'success': False,
711
- 'message': f'Error processing advanced NLP query: {str(e)}'
712
- })
713
-
714
- @app.route('/api/nlp/compare', methods=['POST'])
715
- def compare_routes():
716
- """Compare multiple routes"""
717
- try:
718
- data = request.get_json()
719
- routes = data.get('routes', [])
720
-
721
- if len(routes) < 2:
722
- return jsonify({
723
- 'success': False,
724
- 'message': 'Please provide at least 2 routes to compare.'
725
- })
726
-
727
- # Build comparison query
728
- comparison_query = "MATCH "
729
- for i, route in enumerate(routes):
730
- from_loc = route.get('from')
731
- to_loc = route.get('to')
732
- if from_loc and to_loc:
733
- if i > 0:
734
- comparison_query += ", "
735
- comparison_query += f"(a{i}:Place {{name: '{from_loc}'}})-[r{i}:Fare]->(b{i}:Place {{name: '{to_loc}'}})"
736
-
737
- comparison_query += " RETURN "
738
- for i, route in enumerate(routes):
739
- if i > 0:
740
- comparison_query += ", "
741
- comparison_query += f"a{i}.name + ' to ' + b{i}.name as route{i+1}, r{i}.fare as fare{i+1}"
742
-
743
- # Execute query
744
- with neo4j_service.driver.session() as session:
745
- result = session.run(comparison_query)
746
- results = [dict(record) for record in result]
747
-
748
- return jsonify({
749
- 'success': True,
750
- 'data': results,
751
- 'message': f'Comparison of {len(routes)} routes completed'
752
- })
753
-
754
- except Exception as e:
755
- return jsonify({
756
- 'success': False,
757
- 'message': f'Error comparing routes: {str(e)}'
758
- })
759
-
760
- @app.route('/api/nlp/range', methods=['POST'])
761
- def search_by_range():
762
- """Search routes by price range"""
763
- try:
764
- data = request.get_json()
765
- min_price = data.get('min_price')
766
- max_price = data.get('max_price')
767
-
768
- if min_price is None and max_price is None:
769
- return jsonify({
770
- 'success': False,
771
- 'message': 'Please provide min_price or max_price or both.'
772
- })
773
-
774
- # Build range query
775
- range_query = "MATCH (a:Place)-[r:Fare]->(b:Place) WHERE "
776
- conditions = []
777
-
778
- if min_price is not None:
779
- conditions.append(f"r.fare >= {min_price}")
780
- if max_price is not None:
781
- conditions.append(f"r.fare <= {max_price}")
782
-
783
- range_query += " AND ".join(conditions)
784
- range_query += " RETURN a.name as from_place, b.name as to_place, r.fare as fare ORDER BY r.fare"
785
-
786
- # Execute query
787
- with neo4j_service.driver.session() as session:
788
- result = session.run(range_query)
789
- results = [dict(record) for record in result]
790
-
791
- return jsonify({
792
- 'success': True,
793
- 'data': results,
794
- 'message': f'Found {len(results)} routes in the specified range'
795
- })
796
-
797
- except Exception as e:
798
- return jsonify({
799
- 'success': False,
800
- 'message': f'Error searching by range: {str(e)}'
801
- })
802
-
803
@app.route('/api/nlp/test-all-types')
def test_all_query_types():
    """Run a fixed set of sample queries per query type and report the outcome.

    Every sample is passed to ``enhanced_nlp_processor.process_query``. The
    response groups the individual outcomes by query type and adds an overall
    summary plus the current Neo4j connection flag.
    """
    try:
        # Sample queries, keyed by the query type they are meant to exercise.
        test_queries = {
            'fare_inquiry': [
                'What is the fare from Colombo to Kandy?',
                'fare of anuradhapura to kandy',
                'price from panadura to galle'
            ],
            'comparison': [
                'Compare fares from Colombo to Kandy vs Colombo to Galle',
                'Which is cheaper between Colombo to Kandy and Colombo to Anuradapura?'
            ],
            'range_search': [
                'Find routes under 500 rupees',
                'Show me routes between 200 and 800 rupees',
                'Routes over 1000 rupees'
            ],
            'recommendation': [
                'Recommend cheap routes',
                'Show me popular destinations',
                'What are the best routes from Colombo?'
            ],
            'route_inquiry': [
                'Routes from Colombo',
                'Routes to Galle',
                'What routes depart from Kandy?'
            ],
            'statistics': [
                'What is the average fare?',
                'Database statistics',
                'How many routes are there?'
            ]
        }

        results = {}
        total_queries = 0
        total_successful = 0

        for query_type, queries in test_queries.items():
            outcomes = []
            for query in queries:
                try:
                    # Enhanced NLP path (uses the LLM for Cypher generation)
                    processed = enhanced_nlp_processor.process_query(query)
                    entry = {
                        'query': query,
                        'result': processed,
                        'success': processed.get('success', False)
                    }
                except Exception as e:
                    # A failing sample is recorded, not propagated.
                    entry = {
                        'query': query,
                        'result': {
                            'success': False,
                            'message': f'Error processing query: {e!s}'
                        },
                        'success': False
                    }
                outcomes.append(entry)

            successful = 0
            for outcome in outcomes:
                if outcome['success']:
                    successful += 1

            results[query_type] = {
                'description': f'Test results for {query_type} queries',
                'total_queries': len(queries),
                'successful_queries': successful,
                'examples': outcomes
            }
            total_queries += len(queries)
            total_successful += successful

        # Overall success percentage; guarded against an empty sample set.
        if total_queries > 0:
            success_rate = round((total_successful / total_queries) * 100, 2)
        else:
            success_rate = 0

        return jsonify({
            'success': True,
            'message': f'Tested {total_queries} queries across {len(test_queries)} types. {total_successful} successful.',
            'summary': {
                'total_query_types': len(test_queries),
                'total_queries_tested': total_queries,
                'successful_queries': total_successful,
                'success_rate': success_rate
            },
            'results': results,
            'neo4j_connected': neo4j_service.is_connected()
        })

    except Exception as e:
        return jsonify({
            'success': False,
            'message': f'Error testing query types: {e!s}',
            'neo4j_connected': neo4j_service.is_connected()
        })
896
-
897
@app.errorhandler(404)
def not_found(error):
    """Answer unknown URLs with a JSON error body and HTTP status 404."""
    payload = {
        'success': False,
        'message': 'Endpoint not found'
    }
    return jsonify(payload), 404
903
-
904
@app.errorhandler(500)
def internal_error(error):
    """Answer server errors with a JSON error body and HTTP status 500."""
    payload = {
        'success': False,
        'message': 'Internal server error'
    }
    return jsonify(payload), 500
910
-
911
if __name__ == '__main__':
    # Script entry point: print a start-up banner describing the system, then
    # start the Flask server (blocking call).
    port = int(os.getenv('PORT', 7860))  # Hugging Face Spaces uses port 7860 by default

    print("🚌 Natural Language Transport Query System")
    print("=" * 60)
    print(f"🚀 Starting on port {port}")
    print(f"🌐 Open your browser and go to: http://localhost:{port}")

    # Check Neo4j connection
    if neo4j_service.is_connected():
        print("✅ Connected to Neo4j database")
        stats = neo4j_service.get_route_statistics()
        if stats:
            print(f"📊 Database: {stats.get('total_places', 0)} places, {stats.get('total_routes', 0)} routes")
    else:
        print("⚠️ Neo4j not connected - some features may not work")

    # Check LLM availability
    if spell_corrector.llm_available:
        print("🤖 LLM integration available for spell correction")
    else:
        print("⚠️ LLM not available - using fuzzy matching only")

    # Banner text is kept as data so every entry is printed with the same
    # bullet prefix.
    capability_lines = (
        "Multiple query formats (fare, price, cost)",
        "Natural language patterns (from X to Y, X to Y fare)",
        "Question formats (What is, How much, Show me)",
        "Compact formats (Colombo to Kandy fare)",
        "Spell correction and fuzzy matching",
        "LLM-powered query interpretation",
        "Automatic Cypher query generation",
        "Advanced intent classification",
        "Entity extraction and normalization",
        "Comparison queries (vs, versus, compare)",
        "Range search queries (under, over, between)",
        "Recommendation queries (recommend, suggest)",
        "Confidence scoring for query understanding",
        "Sinhala language support with translation",
        "Automatic Sinhala-English translation",
        "Dictionary-based and Google Translate fallback",
    )
    endpoint_lines = (
        "/api/query - Process natural language queries (enhanced NLP)",
        # This entry was previously printed without its bullet.
        "/api/nlp/capabilities - View enhanced NLP capabilities with live examples",
        "/api/nlp/test-all-types - Test all query types with live results",
        "/api/nlp/test - Test queries with detailed analysis",
        "/api/nlp/demo - Get comprehensive demo queries",
        "/api/examples - Get categorized example queries",
        "/api/sinhala/examples - Get Sinhala example queries",
        "/api/translation/test - Test translation functionality",
        "/api/translation/translate - Translate text between languages",
        "/api/status - System status and statistics",
        "/api/suggestions - Get location suggestions",
        "/api/places - Get all available places",
    )

    print("\n🎯 Enhanced Natural Language Capabilities:")
    for line in capability_lines:
        print(f" • {line}")

    print("\n🔗 Available API Endpoints:")
    for line in endpoint_lines:
        print(f" • {line}")

    print("=" * 60)

    try:
        app.run(debug=False, port=port, host='0.0.0.0')  # Set debug=False for production
    except Exception as e:
        print(f"❌ Error starting application: {e}")
        print("💡 Try running as administrator or check if another application is using the port")
973
-
974
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Main Flask Application for Transport Query System
4
+ """
5
+
6
+ from flask import Flask, render_template, request, jsonify, session
7
+ import os
8
+ from llm_query_processor import LLMQueryProcessor
9
+ from enhanced_nlp_processor import EnhancedNLPProcessor
10
+ from spell_corrector import SpellCorrector
11
+ from neo4j_service import Neo4jService
12
+ from translation_service import TranslationService
13
+ from logger import get_logger
14
+ from config import Config
15
+
16
# Flask application object; its settings are loaded from the project's
# Config class.
app = Flask(__name__)
app.config.from_object(Config)
# Module-level logger used by the route handlers below.
logger = get_logger("FlaskApp")

# Initialize services
# Each service is created once at import time and shared by all requests.
# query_processor is the basic processor used when a request sets
# enhanced_nlp to false; enhanced_nlp_processor is the default path.
query_processor = LLMQueryProcessor()
enhanced_nlp_processor = EnhancedNLPProcessor()
spell_corrector = SpellCorrector()
neo4j_service = Neo4jService()
translation_service = TranslationService()
26
+
27
@app.route('/')
def index():
    """Serve the application's landing page."""
    landing_template = 'index.html'
    return render_template(landing_template)
31
+
32
@app.route('/api/query', methods=['POST'])
def process_query():
    """Process a user query with enhanced NLP and translation support.

    Request JSON:
        query: the natural-language question (required).
        enhanced_nlp: when true (the default) the enhanced NLP processor is
            used, otherwise the basic query processor.

    The query is first passed through ``translation_service.translate_query``.
    When the detected language is not English, the processor result is
    translated back and the translation details are attached to the response
    as ``translation_info``.

    Returns:
        The processor result as JSON, or a ``success: False`` payload with an
        explanatory ``message``.
    """
    try:
        # silent=True yields None for a missing or malformed JSON body, so the
        # caller gets the "please enter a query" message below instead of a
        # parser error.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            data = {}
        # `or ''` also covers an explicit JSON null for "query".
        user_query = str(data.get('query') or '').strip()
        use_enhanced_nlp = data.get('enhanced_nlp', True)  # Default to enhanced NLP

        if not user_query:
            return jsonify({
                'success': False,
                'message': 'Please enter a query.'
            })

        # Auto-detect language and translate if needed
        translation_info = translation_service.translate_query(user_query)

        # Use translated query for processing
        query_to_process = translation_info['translated_query']

        # Looked up once and reused for both the request log and the
        # response translation.
        detected_lang = translation_info.get('detected_language', 'english')
        needs_translation = detected_lang != 'english'

        if needs_translation:
            logger.info(f"Translation: {detected_lang}->en method={translation_info['translation_method']} original='{translation_info['original_query']}' translated='{translation_info['translated_query']}'")
        else:
            logger.info(f"Processing English Query: '{user_query}'")

        # Process the query with enhanced NLP or fall back to the basic processor
        processor = enhanced_nlp_processor if use_enhanced_nlp else query_processor
        result = processor.process_query(query_to_process)

        # If the original query was not in English, translate the response back.
        # Progress goes through the module logger rather than print() so it
        # lands in the same place as the other request logs.
        if needs_translation:
            logger.info(f"English Response: {result.get('message', 'No message')}")
            result = translation_service.translate_response(result, detected_lang)
            result['translation_info'] = translation_info
            logger.info(f"{detected_lang.title()} Response: {result.get('message', 'No message')}")

        logger.info(f"Response success={result.get('success')} type={result.get('query_type','n/a')} message='{result.get('message','')[:120]}'")
        return jsonify(result)

    except Exception as e:
        # Keep the traceback in the log; the client only receives the summary.
        logger.exception("Error processing query")
        return jsonify({
            'success': False,
            'message': f'Error processing query: {str(e)}'
        })
82
+
83
@app.route('/api/suggestions', methods=['POST'])
def get_suggestions():
    """Return autocomplete candidates for a partially typed location.

    Reads ``location`` from the JSON body and answers with a ``suggestions``
    list of ``{'name', 'confidence'}`` objects; a blank location yields an
    empty list.
    """
    try:
        payload = request.get_json()
        fragment = payload.get('location', '').strip()

        if fragment:
            candidates = spell_corrector.get_suggestions(fragment)
            formatted = []
            for name, conf in candidates:
                formatted.append({'name': name, 'confidence': conf})
            return jsonify({'suggestions': formatted})

        return jsonify({'suggestions': []})

    except Exception as e:
        return jsonify({
            'success': False,
            'message': f'Error getting suggestions: {e!s}'
        })
104
+
105
@app.route('/api/status')
def get_status():
    """Report the Neo4j connection state, place count and route statistics.

    When the database is not connected, the place count is 0 and the
    statistics are an empty object.
    """
    try:
        neo4j_connected = neo4j_service.is_connected()

        # Only query the database when the connection check succeeded.
        if neo4j_connected:
            places = neo4j_service.get_all_places()
            stats = neo4j_service.get_route_statistics()
        else:
            places = []
            stats = {}

        status = {
            'neo4j_connected': neo4j_connected,
            'total_places': len(places),
            'statistics': stats
        }
        return jsonify(status)

    except Exception as e:
        return jsonify({
            'success': False,
            'message': f'Error getting status: {e!s}'
        })
124
+
125
@app.route('/api/places')
def get_places():
    """Return every place known to the Neo4j service."""
    try:
        payload = {
            'success': True,
            'places': neo4j_service.get_all_places()
        }
    except Exception as e:
        payload = {
            'success': False,
            'message': f'Error getting places: {e!s}'
        }
        return jsonify(payload)
    return jsonify(payload)
140
+
141
@app.route('/api/sinhala/examples')
def get_sinhala_examples():
    """Return the translation service's Sinhala example queries."""
    try:
        payload = {
            'success': True,
            'examples': translation_service.get_sinhala_examples()
        }
    except Exception as e:
        payload = {
            'success': False,
            'message': f'Error getting Sinhala examples: {e!s}'
        }
        return jsonify(payload)
    return jsonify(payload)
156
+
157
@app.route('/api/tamil/examples')
def get_tamil_examples():
    """Return the translation service's Tamil example queries."""
    try:
        payload = {
            'success': True,
            'examples': translation_service.get_tamil_examples()
        }
    except Exception as e:
        payload = {
            'success': False,
            'message': f'Error getting Tamil examples: {e!s}'
        }
        return jsonify(payload)
    return jsonify(payload)
172
+
173
@app.route('/api/language/detect', methods=['POST'])
def detect_language():
    """Detect the language of the ``text`` field in the JSON body.

    Blank text is rejected with an explanatory message; otherwise the
    detector's result is returned unchanged under ``detection_result``.
    """
    try:
        payload = request.get_json()
        text = payload.get('text', '').strip()

        if text:
            detector = translation_service.language_detector
            outcome = detector.detect_language(text)
            return jsonify({
                'success': True,
                'detection_result': outcome
            })

        return jsonify({
            'success': False,
            'message': 'Please provide text to detect language.'
        })

    except Exception as e:
        return jsonify({
            'success': False,
            'message': f'Error detecting language: {e!s}'
        })
198
+
199
@app.route('/api/translation/test')
def test_translation():
    """Run the translation service's self-test and return its results."""
    try:
        payload = {
            'success': True,
            'test_results': translation_service.test_translation()
        }
    except Exception as e:
        payload = {
            'success': False,
            'message': f'Error testing translation: {e!s}'
        }
        return jsonify(payload)
    return jsonify(payload)
214
+
215
@app.route('/api/translation/translate', methods=['POST'])
def translate_text():
    """Translate text between supported languages (Sinhala, Tamil, Singlish, English).

    Request JSON:
        text: the text to translate (required).
        target_lang: target language code, 'en' (default), 'si' or 'ta'.
        source_lang: source language code, or 'auto' (default).

    Language detection always runs, so the response can report
    ``detected_language`` and ``detection_confidence``. ``source_language``
    is the detected code when ``source_lang`` is 'auto', and otherwise the
    code the caller supplied.

    Returns:
        A JSON response with the original and translated text plus
        detection/translation metadata, or a ``success: False`` payload with
        an explanatory ``message``.
    """
    try:
        # silent=True yields None for a missing or malformed JSON body, so the
        # caller gets the "please provide text" message instead of a parser
        # error.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            data = {}
        # `or ''` also covers an explicit JSON null for "text".
        text = str(data.get('text') or '').strip()
        target_lang = data.get('target_lang', 'en')  # 'en', 'si', 'ta'
        source_lang = data.get('source_lang', 'auto')

        if not text:
            return jsonify({
                'success': False,
                'message': 'Please provide text to translate.'
            })

        detection_result = translation_service.language_detector.detect_language(text)
        detected_language = detection_result['language']

        # Map detected language to language code. Singlish is treated as
        # Sinhala for translation; anything unrecognised counts as English.
        language_codes = {
            'sinhala': 'si',
            'tamil': 'ta',
            'singlish': 'si',
        }
        detected_lang_code = language_codes.get(detected_language, 'en')

        # An explicit source language from the caller takes precedence in the
        # report; previously the detected code was returned even when the
        # caller had specified the source language.
        if source_lang == 'auto':
            effective_source = detected_lang_code
        else:
            effective_source = source_lang

        translated_text = translation_service.translate_text(text, target_lang, source_lang)

        return jsonify({
            'success': True,
            'original_text': text,
            'translated_text': translated_text,
            'detected_language': detected_language,
            'source_language': effective_source,
            'target_language': target_lang,
            'translation_method': translation_service.last_translation_method or 'dictionary',
            'detection_confidence': detection_result['confidence']
        })

    except Exception as e:
        return jsonify({
            'success': False,
            'message': f'Error translating text: {str(e)}'
        })
262
+
263
@app.route('/api/nlp/capabilities')
def get_nlp_capabilities():
    """Get information about natural language processing capabilities with live examples.

    For each query type, the first two example queries are run through
    ``enhanced_nlp_processor.process_query`` on every request, and the
    outcomes are returned as ``live_examples`` next to the static capability
    descriptions.

    Returns:
        A JSON response with ``success`` and a ``capabilities`` object.
    """

    # Test queries for each type to demonstrate actual results
    test_queries = [
        {
            'type': 'fare_inquiry',
            'description': 'Find fare between two specific locations',
            'examples': [
                'What is the fare from Colombo to Kandy?',
                'fare of anuradhapura to kandy',
                'price from panadura to galle',
                'Colombo to Kandy fare'
            ]
        },
        {
            'type': 'comparison',
            'description': 'Compare fares between different routes',
            'examples': [
                'Compare fares from Colombo to Kandy vs Colombo to Galle',
                'Which is cheaper between Colombo to Kandy and Colombo to Anuradapura?',
                'What is the difference in fare between Panadura to Galle and Panadura to Matara?'
            ]
        },
        {
            'type': 'range_search',
            'description': 'Find routes within specific price ranges',
            'examples': [
                'Find routes under 500 rupees',
                'Show me routes between 200 and 800 rupees',
                'Routes over 1000 rupees'
            ]
        },
        {
            'type': 'recommendation',
            'description': 'Get route recommendations based on criteria',
            'examples': [
                'Recommend cheap routes',
                'Show me popular destinations',
                'What are the best routes from Colombo?'
            ]
        },
        {
            'type': 'route_inquiry',
            'description': 'Find routes from/to specific locations',
            'examples': [
                'Routes from Colombo',
                'Routes to Galle',
                'What routes depart from Kandy?'
            ]
        },
        {
            'type': 'statistics',
            'description': 'Get database overview and statistics',
            'examples': [
                'What is the average fare?',
                'Database statistics',
                'How many routes are there?'
            ]
        }
    ]

    # Process each test query to get actual results
    live_examples = []
    for query_type in test_queries:
        type_examples = []
        for example_query in query_type['examples'][:2]:  # Test first 2 examples
            try:
                result = enhanced_nlp_processor.process_query(example_query)
                type_examples.append({
                    'query': example_query,
                    'result': result
                })
            except Exception as e:
                # A failing example is reported in place, so one bad query
                # cannot break the whole capabilities response.
                type_examples.append({
                    'query': example_query,
                    'result': {
                        'success': False,
                        'message': f'Error: {str(e)}'
                    }
                })

        live_examples.append({
            'type': query_type['type'],
            'description': query_type['description'],
            'examples': type_examples
        })

    capabilities = {
        'natural_language_processing': {
            'description': 'Advanced NLP for transport queries with enhanced understanding',
            'features': [
                'Multiple query formats (fare, price, cost)',
                'Natural language patterns (from X to Y, X to Y fare, etc.)',
                'Question formats (What is, How much, Show me, etc.)',
                'Compact formats (Colombo to Kandy fare)',
                'Spell correction and fuzzy matching',
                'Automatic location name correction',
                'LLM-powered query interpretation',
                'Fallback keyword-based processing',
                'Advanced intent classification',
                'Entity extraction and normalization',
                'Confidence scoring for query understanding'
            ]
        },
        'query_types': test_queries,
        'live_examples': live_examples,
        'spell_correction': {
            'description': 'Automatic location name correction',
            'methods': [
                'Direct mapping (exact matches)',
                'Fuzzy matching (similar names)',
                'LLM correction (AI-powered)',
                'Partial matching (substring matching)'
            ],
            'examples': [
                'panadra → Panadura',
                # Arrow restored: this entry previously read 'gale Galle'.
                'gale → Galle',
                'colmbo → Colombo',
                'kandee → Kandy'
            ]
        },
        'llm_integration': {
            'description': 'AI-powered query interpretation with LLM Cypher generation',
            'features': [
                'Automatic query type detection',
                'LLM-powered Cypher query generation',
                'Natural language understanding',
                'Fallback to keyword-based processing',
                'Advanced entity extraction',
                'Intent classification with confidence scoring',
                'Real-time database querying'
            ]
        },
        'enhanced_features': {
            'description': 'Advanced NLP capabilities',
            'features': [
                'Multi-intent query understanding',
                'Context-aware responses',
                'Query preprocessing and normalization',
                'Advanced pattern matching',
                'Confidence-based result ranking',
                'Comprehensive query analysis',
                'Live database results for all query types'
            ]
        }
    }

    return jsonify({
        'success': True,
        'capabilities': capabilities
    })
416
+
417
@app.route('/api/nlp/test', methods=['POST'])
def test_nlp_query():
    """Run one query through the pipeline and return a step-by-step analysis.

    Request JSON:
        query: the natural-language question (required).
        enhanced_nlp: choose the enhanced processor (default) or the basic one.

    The response contains an ``analysis`` with the recorded processing steps
    (location extraction, query processing and, when data came back, a sample
    of the results) together with the raw processor ``result``.
    """
    try:
        payload = request.get_json()
        user_query = payload.get('query', '').strip()
        use_enhanced_nlp = payload.get('enhanced_nlp', True)

        if not user_query:
            return jsonify({
                'success': False,
                'message': 'Please provide a query to test.'
            })

        steps = []

        # Step 1: locations found in the query, with their corrections.
        locations = spell_corrector.extract_locations_from_query(user_query)
        location_details = []
        for loc in locations:
            location_details.append({
                'original': loc[0],
                'corrected': loc[1],
                'confidence': loc[2],
                'method': loc[3]
            })
        steps.append({
            'step': 'Location Extraction',
            'locations_found': len(locations),
            'details': location_details
        })

        # Step 2: run the selected processor and record what it reported.
        if use_enhanced_nlp:
            result = enhanced_nlp_processor.process_query(user_query)
            query_analysis = result.get('query_analysis', {})
            step_name = 'Enhanced NLP Processing'
            extras = {
                'confidence': query_analysis.get('confidence', 0),
                'intent': query_analysis.get('intent', {}),
                'entities': query_analysis.get('entities', {})
            }
        else:
            result = query_processor.process_query(user_query)
            step_name = 'Basic Query Processing'
            extras = {
                'cypher_query': result.get('cypher_query', ''),
                'corrections': result.get('corrections', [])
            }

        processing_step = {
            'step': step_name,
            'success': result.get('success', False),
            'query_type': result.get('query_type', 'unknown'),
            'message': result.get('message', '')
        }
        processing_step.update(extras)
        steps.append(processing_step)

        # Step 3: only recorded when the processor succeeded and returned data.
        if result.get('success') and result.get('data'):
            rows = result['data']
            steps.append({
                'step': 'Database Results',
                'results_count': len(rows),
                'sample_results': rows[:3]  # Show first 3 results
            })

        analysis = {
            'original_query': user_query,
            'processing_steps': steps
        }

        return jsonify({
            'success': True,
            'analysis': analysis,
            'result': result
        })

    except Exception as e:
        return jsonify({
            'success': False,
            'message': f'Error testing NLP query: {e!s}'
        })
494
+
495
@app.route('/api/nlp/demo')
def get_nlp_demo():
    """Return the static catalogue of demo queries, grouped by category."""
    # (category title, sample queries) pairs in presentation order.
    sections = (
        ('Basic Fare Queries', [
            'What is the fare from Colombo to Kandy?',
            'fare of anuradhapura to kandy',
            'price from panadura to galle',
            'Colombo to Kandy fare'
        ]),
        ('Comparison Queries', [
            'Compare fares from Colombo to Kandy vs Colombo to Galle',
            'Which is cheaper between Colombo to Kandy and Colombo to Anuradapura?',
            'What is the difference in fare between Panadura to Galle and Panadura to Matara?'
        ]),
        ('Range Search Queries', [
            'Find routes under 500 rupees',
            'Show me routes between 200 and 800 rupees',
            'Routes over 1000 rupees'
        ]),
        ('Recommendation Queries', [
            'Recommend cheap routes',
            'Show me popular destinations',
            'What are the best routes from Colombo?'
        ]),
        ('Statistical Queries', [
            'What is the average fare?',
            'Database statistics',
            'How many routes are there?'
        ]),
        ('Route Queries', [
            'Show me the cheapest routes',
            'Routes from Colombo',
            'Routes to Galle',
            'What routes depart from Kandy?'
        ]),
        ('Spell Correction Tests', [
            'price from panadra to gale',
            'fare of colmbo to kandee',
            'cost from anuradapura to kandy'
        ]),
    )

    demo_queries = []
    for title, samples in sections:
        demo_queries.append({'category': title, 'queries': samples})

    demo = {
        'title': 'Enhanced Natural Language Transport Query Demo',
        'description': 'Advanced NLP capabilities with comparison, range search, and recommendations',
        'categories': demo_queries
    }
    return jsonify({
        'success': True,
        'demo': demo
    })
567
+
568
+ @app.route('/api/examples')
569
+ def get_examples():
570
+ """Get comprehensive example queries showcasing natural language capabilities"""
571
+ examples = [
572
+ # === SINHALA FARE QUERIES ===
573
+ {
574
+ 'category': 'Sinhala Fare Queries (සිංහල)',
575
+ 'examples': [
576
+ {
577
+ 'query': 'කොළඹ සිට මහනුවරට ගාස්තුව කීයද?',
578
+ 'description': 'Standard fare query format'
579
+ },
580
+ {
581
+ 'query': 'පානදුරේ ඉඳන් ගාල්ලට කීයක් යනවද?',
582
+ 'description': 'Alternative way to ask for fare'
583
+ },
584
+ {
585
+ 'query': 'අනුරාධපුර සිට මහනුවර දක්වා ගාස්තුව',
586
+ 'description': 'Natural language format'
587
+ },
588
+ {
589
+ 'query': 'මහනුවර සිට මාතරට ගාස්තුව කීයද?',
590
+ 'description': 'Question format'
591
+ }
592
+ ]
593
+ },
594
+
595
+ # === TAMIL FARE QUERIES ===
596
+ {
597
+ 'category': 'Tamil Fare Queries (தமிழ்)',
598
+ 'examples': [
599
+ {
600
+ 'query': 'கொழும்பு இருந்து கண்டி வரை பேருந்து கட்டணம் எவ்வளவு?',
601
+ 'description': 'Standard Tamil fare query'
602
+ },
603
+ {
604
+ 'query': 'மாத்தறை இருந்து காலி வரை விலை எவ்வளவு?',
605
+ 'description': 'Alternative Tamil fare query'
606
+ },
607
+ {
608
+ 'query': 'அனுராதபுரம் இருந்து கொழும்பு வரை கட்டணம்',
609
+ 'description': 'Tamil natural language format'
610
+ },
611
+ {
612
+ 'query': 'பனதுரை இருந்து காலி வரை பேருந்து கட்டணம் எவ்வளவு?',
613
+ 'description': 'Tamil question format'
614
+ }
615
+ ]
616
+ },
617
+
618
+ # === SINGLISH FARE QUERIES ===
619
+ {
620
+ 'category': 'Singlish Fare Queries (Mixed)',
621
+ 'examples': [
622
+ {
623
+ 'query': 'කොළඹ සිට Kandy ගාස්තුව කීයද?',
624
+ 'description': 'Sinhala-English mixed query'
625
+ },
626
+ {
627
+ 'query': 'Colombo සිට ගාල්ලට bus fare කීයද?',
628
+ 'description': 'English-Sinhala mixed query'
629
+ },
630
+ {
631
+ 'query': 'කොළඹ සිට Panadura දක්වා price කීයද?',
632
+ 'description': 'Mixed language with English terms'
633
+ },
634
+ {
635
+ 'query': 'Galle සිට මාතරට ticket cost කීයද?',
636
+ 'description': 'Mixed language fare query'
637
+ }
638
+ ]
639
+ },
640
+
641
+ # === ENGLISH FARE QUERIES ===
642
+ {
643
+ 'category': 'English Fare Queries',
644
+ 'examples': [
645
+ {
646
+ 'query': 'What is the fare from Colombo to Kandy?',
647
+ 'description': 'Standard English fare query'
648
+ },
649
+ {
650
+ 'query': 'How much is the bus fare from Panadura to Galle?',
651
+ 'description': 'English question format'
652
+ },
653
+ {
654
+ 'query': 'Price from Anuradhapura to Kandy',
655
+ 'description': 'Compact English format'
656
+ },
657
+ {
658
+ 'query': 'Show me the cost from Matara to Colombo',
659
+ 'description': 'English request format'
660
+ }
661
+ ]
662
+ },
663
+
664
+ # === COMPARISON QUERIES ===
665
+ {
666
+ 'category': 'Sinhala Comparison Queries (සිංහල)',
667
+ 'examples': [
668
+ {
669
+ 'query': 'කොළඹ සිට මහනුවර දක්වා සහ කොළඹ සිට ගාල්ල දක්වා ගාස්තු සංසන්දනය කරන්න.',
670
+ 'description': 'Compare two different routes'
671
+ },
672
+ {
673
+ 'query': 'කොළඹ සිට මහනුවර දක්වා සහ කොළඹ සිට අනුරාධපුර දක්වා ලාභදායී වන්නේ කුමක්ද?',
674
+ 'description': 'Find the cheaper option'
675
+ }
676
+ ]
677
+ },
678
+
679
+ {
680
+ 'category': 'Tamil Comparison Queries (தமிழ்)',
681
+ 'examples': [
682
+ {
683
+ 'query': 'கொழும்பு இருந்து கண்டி வரை மற்றும் கொழும்பு இருந்து காலி வரை கட்டணம் ஒப்பிடு.',
684
+ 'description': 'Compare two different routes in Tamil'
685
+ },
686
+ {
687
+ 'query': 'கொழும்பு இருந்து கண்டி வரை மற்றும் கொழும்பு இருந்து அனுராதபுரம் வரை கட்டணத்தின் வித்தியாசம் எவ்வளவு?',
688
+ 'description': 'Calculate fare difference in Tamil'
689
+ }
690
+ ]
691
+ },
692
+
693
+ {
694
+ 'category': 'Singlish Comparison Queries (Mixed)',
695
+ 'examples': [
696
+ {
697
+ 'query': 'කොළඹ සිට Kandy සහ Colombo සිට Galle fares compare කරන්න.',
698
+ 'description': 'Mixed language comparison'
699
+ },
700
+ {
701
+ 'query': 'Colombo සිට මහනුවර සහ Colombo සිට අනුරාධපුර cheaper කුමක්ද?',
702
+ 'description': 'Mixed language cheaper option'
703
+ }
704
+ ]
705
+ },
706
+
707
+ # === RANGE SEARCH QUERIES ===
708
+ {
709
+ 'category': 'Sinhala Range Queries (සිංහල)',
710
+ 'examples': [
711
+ {
712
+ 'query': 'රුපියල් 500ට අඩු මාර්ග සොයා ගන්න',
713
+ 'description': 'Find affordable routes'
714
+ },
715
+ {
716
+ 'query': 'රුපියල් 200 සහ 800 අතර මාර්ග සොයා ගන්න',
717
+ 'description': 'Find routes in price range'
718
+ }
719
+ ]
720
+ },
721
+
722
+ {
723
+ 'category': 'Tamil Range Queries (தமிழ்)',
724
+ 'examples': [
725
+ {
726
+ 'query': 'ரூபாய் 500 க்கு குறைவான கட்டணம் உள்ள பாதைகளை கண்டுபிடி',
727
+ 'description': 'Find affordable routes in Tamil'
728
+ },
729
+ {
730
+ 'query': 'ரூபாய் 200 மற்றும் 800 இடையில் கட்டணம் உள்ள பாதைகளை காட்டு',
731
+ 'description': 'Find routes in price range in Tamil'
732
+ }
733
+ ]
734
+ },
735
+
736
+ {
737
+ 'category': 'Singlish Range Queries (Mixed)',
738
+ 'examples': [
739
+ {
740
+ 'query': 'රුපියල් 500ට අඩු routes find කරන්න',
741
+ 'description': 'Mixed language range search'
742
+ },
743
+ {
744
+ 'query': 'Rs. 200 සහ 800 අතර මාර්ග show කරන්න',
745
+ 'description': 'Mixed language price range'
746
+ }
747
+ ]
748
+ },
749
+
750
+ # === RECOMMENDATION QUERIES ===
751
+ {
752
+ 'category': 'Sinhala Recommendation Queries (සිංහල)',
753
+ 'examples': [
754
+ {
755
+ 'query': 'ලාභ මාර්ග නිර්දේශ කරන්න',
756
+ 'description': 'Get budget-friendly recommendations'
757
+ },
758
+ {
759
+ 'query': 'මට ජනප්‍රිය ගමනාන්ත පෙන්වන්න',
760
+ 'description': 'Find frequently traveled routes'
761
+ }
762
+ ]
763
+ },
764
+
765
+ {
766
+ 'category': 'Tamil Recommendation Queries (தமிழ்)',
767
+ 'examples': [
768
+ {
769
+ 'query': 'குறைந்த விலையில் பாதைகளை பரிந்துரை',
770
+ 'description': 'Get budget-friendly recommendations in Tamil'
771
+ },
772
+ {
773
+ 'query': 'பிரபலமான இலக்குகளை காட்டு',
774
+ 'description': 'Find frequently traveled routes in Tamil'
775
+ }
776
+ ]
777
+ },
778
+
779
+ {
780
+ 'category': 'Singlish Recommendation Queries (Mixed)',
781
+ 'examples': [
782
+ {
783
+ 'query': 'ලාභ routes recommend කරන්න',
784
+ 'description': 'Mixed language recommendations'
785
+ },
786
+ {
787
+ 'query': 'Popular destinations show කරන්න',
788
+ 'description': 'Mixed language popular routes'
789
+ }
790
+ ]
791
+ },
792
+
793
+ # === STATISTICAL QUERIES ===
794
+ {
795
+ 'category': 'Sinhala Statistical Queries (සිංහල)',
796
+ 'examples': [
797
+ {
798
+ 'query': 'සාමාන්‍ය ගාස්තුව කීයද?',
799
+ 'description': 'Get average fare statistics'
800
+ },
801
+ {
802
+ 'query': 'දත්ත සමුදා සංඛ්යා ලේඛන',
803
+ 'description': 'Get comprehensive database overview'
804
+ }
805
+ ]
806
+ },
807
+
808
+ {
809
+ 'category': 'Tamil Statistical Queries (தமிழ்)',
810
+ 'examples': [
811
+ {
812
+ 'query': 'சராசரி கட்டணம் எவ்வளவு?',
813
+ 'description': 'Get average fare statistics in Tamil'
814
+ },
815
+ {
816
+ 'query': 'தரவு சேமிப்பக புள்ளிவிவரங்கள்',
817
+ 'description': 'Get comprehensive database overview in Tamil'
818
+ }
819
+ ]
820
+ },
821
+
822
+ {
823
+ 'category': 'Singlish Statistical Queries (Mixed)',
824
+ 'examples': [
825
+ {
826
+ 'query': 'Average fare කීයද?',
827
+ 'description': 'Mixed language statistics'
828
+ },
829
+ {
830
+ 'query': 'Database statistics show කරන්න',
831
+ 'description': 'Mixed language database overview'
832
+ }
833
+ ]
834
+ },
835
+
836
+ # === ROUTE QUERIES ===
837
+ {
838
+ 'category': 'Sinhala Route Queries (සිංහල)',
839
+ 'examples': [
840
+ {
841
+ 'query': 'මට ලාභදායී මාර්ග 10ක් පෙන්වන්න',
842
+ 'description': 'Find top 10 cheapest routes'
843
+ },
844
+ {
845
+ 'query': 'කොළඹ සිට යාමට මාර්ග මොනවාද?',
846
+ 'description': 'Find all routes departing from a location'
847
+ }
848
+ ]
849
+ },
850
+
851
+ {
852
+ 'category': 'Tamil Route Queries (தமிழ்)',
853
+ 'examples': [
854
+ {
855
+ 'query': 'குறைந்த விலையில் பாதைகள் 10 காட்டு',
856
+ 'description': 'Find top 10 cheapest routes in Tamil'
857
+ },
858
+ {
859
+ 'query': 'கொழும்பு இருந்து போகும் பாதைகள் என்ன?',
860
+ 'description': 'Find all routes departing from a location in Tamil'
861
+ }
862
+ ]
863
+ },
864
+
865
+ {
866
+ 'category': 'Singlish Route Queries (Mixed)',
867
+ 'examples': [
868
+ {
869
+ 'query': 'ලාභදායී routes 10ක් show කරන්න',
870
+ 'description': 'Mixed language cheapest routes'
871
+ },
872
+ {
873
+ 'query': 'Colombo සිට යන මාර්ග මොනවාද?',
874
+ 'description': 'Mixed language route queries'
875
+ }
876
+ ]
877
+ },
878
+
879
+ # === SPELLING ERROR EXAMPLES ===
880
+ {
881
+ 'category': 'Sinhala Spell Correction (සිංහල)',
882
+ 'examples': [
883
+ {
884
+ 'query': 'පාන්දුරේ ඉඳන් ගාල්ල්ට කීයක් යනවද?',
885
+ 'description': 'Test spell correction (Panadura, Galle)'
886
+ },
887
+ {
888
+ 'query': 'කොළ්බ්හ සිට මහනුවර්ට ගාස්තුව කීයද?',
889
+ 'description': 'Test spell correction (Colombo, Kandy)'
890
+ }
891
+ ]
892
+ },
893
+
894
+ {
895
+ 'category': 'Tamil Spell Correction (தமிழ்)',
896
+ 'examples': [
897
+ {
898
+ 'query': 'கொழும்பு இருந்து கண்டி வரை பேருந்து கட்டணம் எவ்வளவு?',
899
+ 'description': 'Test Tamil spell correction'
900
+ },
901
+ {
902
+ 'query': 'பனதுரை இருந்து காலி வரை விலை எவ்வளவு?',
903
+ 'description': 'Test Tamil with common variations'
904
+ }
905
+ ]
906
+ },
907
+
908
+ {
909
+ 'category': 'Singlish Spell Correction (Mixed)',
910
+ 'examples': [
911
+ {
912
+ 'query': 'කොළඹ සිට Kandy ගාස්තුව කීයද?',
913
+ 'description': 'Test mixed language spell correction'
914
+ },
915
+ {
916
+ 'query': 'Colombo සිට ගාල්ලට bus fare කීයද?',
917
+ 'description': 'Test Singlish with English terms'
918
+ }
919
+ ]
920
+ }
921
+ ]
922
+
923
+ return jsonify({
924
+ 'success': True,
925
+ 'examples': examples
926
+ })
927
+
928
@app.route('/api/nlp/advanced', methods=['POST'])
def advanced_nlp_query():
    """Run a query through the enhanced NLP processor and return its analysis.

    Expects a JSON body of the form ``{"query": "<text>"}``.

    Returns:
        The processor's result serialised as JSON, or a
        ``{'success': False, 'message': ...}`` payload when the query is
        missing or processing raises.
    """
    try:
        # silent=True makes a missing, empty or malformed JSON body come back
        # as None instead of raising, so it is reported as a missing query
        # rather than as an opaque AttributeError message.
        data = request.get_json(silent=True)
        raw_query = data.get('query') if isinstance(data, dict) else None
        # Only strings can be stripped; any other JSON type counts as missing.
        user_query = raw_query.strip() if isinstance(raw_query, str) else ''

        if not user_query:
            return jsonify({
                'success': False,
                'message': 'Please provide a query to process.'
            })

        # Process with enhanced NLP
        result = enhanced_nlp_processor.process_query(user_query)

        return jsonify(result)

    except Exception as e:
        return jsonify({
            'success': False,
            'message': f'Error processing advanced NLP query: {str(e)}'
        })
951
+
952
@app.route('/api/nlp/compare', methods=['POST'])
def compare_routes():
    """Compare the fares of two or more routes.

    Expects a JSON body ``{"routes": [{"from": ..., "to": ...}, ...]}``.
    Entries that are not objects or that lack either endpoint are ignored;
    at least two usable routes are required.

    Returns:
        JSON with ``success``, ``data`` (the rows returned by the database,
        with columns ``route1``/``fare1``, ``route2``/``fare2``, ...) and
        ``message``; or a failure payload with ``success: False``.
    """
    try:
        data = request.get_json(silent=True)
        routes = data.get('routes') if isinstance(data, dict) else None
        if not isinstance(routes, list):
            routes = []

        # Filter once, up front, so the MATCH and RETURN clauses are built
        # from exactly the same set of routes. Building them from different
        # sets produced "MATCH , (...)" or RETURN items that referenced
        # variables never bound in MATCH.
        valid_routes = [
            (route['from'], route['to'])
            for route in routes
            if isinstance(route, dict) and route.get('from') and route.get('to')
        ]

        if len(valid_routes) < 2:
            return jsonify({
                'success': False,
                'message': 'Please provide at least 2 routes to compare.'
            })

        # Build comparison query. Place names travel as query parameters
        # ($from0, $to0, ...) and are never spliced into the Cypher text, so
        # user input cannot change the structure of the statement.
        patterns = []
        columns = []
        params = {}
        for i, (from_loc, to_loc) in enumerate(valid_routes):
            patterns.append(
                f"(a{i}:Place {{name: $from{i}}})-[r{i}:Fare]->(b{i}:Place {{name: $to{i}}})"
            )
            columns.append(
                f"a{i}.name + ' to ' + b{i}.name as route{i+1}, r{i}.fare as fare{i+1}"
            )
            params[f'from{i}'] = from_loc
            params[f'to{i}'] = to_loc

        comparison_query = "MATCH " + ", ".join(patterns) + " RETURN " + ", ".join(columns)

        # Execute query
        with neo4j_service.driver.session() as session:
            result = session.run(comparison_query, params)
            results = [dict(record) for record in result]

        return jsonify({
            'success': True,
            'data': results,
            'message': f'Comparison of {len(valid_routes)} routes completed'
        })

    except Exception as e:
        return jsonify({
            'success': False,
            'message': f'Error comparing routes: {str(e)}'
        })
997
+
998
@app.route('/api/nlp/range', methods=['POST'])
def search_by_range():
    """Search routes whose fare lies within an optional price range.

    Expects a JSON body with ``min_price`` and/or ``max_price``. Each bound
    may be a JSON number or a numeric string; anything else is rejected.

    Returns:
        JSON with ``success``, ``data`` (rows with ``from_place``,
        ``to_place`` and ``fare``, ordered by fare) and ``message``; or a
        failure payload with ``success: False``.
    """
    import math  # local import: only needed for the finiteness check below

    def parse_price(value):
        """Return *value* as a finite float, or None when it is absent."""
        if value is None:
            return None
        if isinstance(value, bool):
            # bool is an int subclass; True/False are not meaningful prices.
            raise ValueError('boolean is not a price')
        number = float(value)  # TypeError for lists/objects, ValueError for bad text
        if not math.isfinite(number):
            raise ValueError('price must be finite')
        return number

    try:
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            data = {}

        try:
            min_price = parse_price(data.get('min_price'))
            max_price = parse_price(data.get('max_price'))
        except (TypeError, ValueError):
            return jsonify({
                'success': False,
                'message': 'min_price and max_price must be numeric values.'
            })

        if min_price is None and max_price is None:
            return jsonify({
                'success': False,
                'message': 'Please provide min_price or max_price or both.'
            })

        # Build range query. The bounds are passed as query parameters so
        # request data is never interpolated into the Cypher text.
        conditions = []
        params = {}

        if min_price is not None:
            conditions.append("r.fare >= $min_price")
            params['min_price'] = min_price
        if max_price is not None:
            conditions.append("r.fare <= $max_price")
            params['max_price'] = max_price

        range_query = (
            "MATCH (a:Place)-[r:Fare]->(b:Place) WHERE "
            + " AND ".join(conditions)
            + " RETURN a.name as from_place, b.name as to_place, r.fare as fare ORDER BY r.fare"
        )

        # Execute query
        with neo4j_service.driver.session() as session:
            result = session.run(range_query, params)
            results = [dict(record) for record in result]

        return jsonify({
            'success': True,
            'data': results,
            'message': f'Found {len(results)} routes in the specified range'
        })

    except Exception as e:
        return jsonify({
            'success': False,
            'message': f'Error searching by range: {str(e)}'
        })
1040
+
1041
@app.route('/api/nlp/test-all-types')
def test_all_query_types():
    """Run a fixed set of sample queries per query type and report outcomes.

    Every sample is passed to ``enhanced_nlp_processor.process_query``; a
    failure of one sample is recorded in its own entry and does not stop the
    run. The response carries per-type results plus an overall summary.
    """

    def run_sample(text):
        """Process one sample query and wrap its outcome in a result entry."""
        try:
            # Process with enhanced NLP (uses LLM for Cypher generation)
            outcome = enhanced_nlp_processor.process_query(text)
            return {
                'query': text,
                'result': outcome,
                'success': outcome.get('success', False)
            }
        except Exception as e:
            return {
                'query': text,
                'result': {
                    'success': False,
                    'message': f'Error processing query: {str(e)}'
                },
                'success': False
            }

    try:
        # Sample queries, grouped by the query type they are meant to exercise
        test_queries = {
            'fare_inquiry': [
                'What is the fare from Colombo to Kandy?',
                'fare of anuradhapura to kandy',
                'price from panadura to galle'
            ],
            'comparison': [
                'Compare fares from Colombo to Kandy vs Colombo to Galle',
                'Which is cheaper between Colombo to Kandy and Colombo to Anuradapura?'
            ],
            'range_search': [
                'Find routes under 500 rupees',
                'Show me routes between 200 and 800 rupees',
                'Routes over 1000 rupees'
            ],
            'recommendation': [
                'Recommend cheap routes',
                'Show me popular destinations',
                'What are the best routes from Colombo?'
            ],
            'route_inquiry': [
                'Routes from Colombo',
                'Routes to Galle',
                'What routes depart from Kandy?'
            ],
            'statistics': [
                'What is the average fare?',
                'Database statistics',
                'How many routes are there?'
            ]
        }

        results = {}
        for query_type, samples in test_queries.items():
            entries = [run_sample(sample) for sample in samples]
            results[query_type] = {
                'description': f'Test results for {query_type} queries',
                'total_queries': len(samples),
                'successful_queries': sum(1 for entry in entries if entry['success']),
                'examples': entries
            }

        # Summary statistics
        total_queries = sum(len(samples) for samples in test_queries.values())
        total_successful = sum(
            section['successful_queries'] for section in results.values()
        )
        if total_queries > 0:
            success_rate = round((total_successful / total_queries) * 100, 2)
        else:
            success_rate = 0

        return jsonify({
            'success': True,
            'message': f'Tested {total_queries} queries across {len(test_queries)} types. {total_successful} successful.',
            'summary': {
                'total_query_types': len(test_queries),
                'total_queries_tested': total_queries,
                'successful_queries': total_successful,
                'success_rate': success_rate
            },
            'results': results,
            'neo4j_connected': neo4j_service.is_connected()
        })

    except Exception as e:
        return jsonify({
            'success': False,
            'message': f'Error testing query types: {str(e)}',
            'neo4j_connected': neo4j_service.is_connected()
        })
1134
+
1135
@app.errorhandler(404)
def not_found(error):
    """Answer requests for unknown endpoints with a JSON 404 payload."""
    payload = {
        'success': False,
        'message': 'Endpoint not found'
    }
    return jsonify(payload), 404
1141
+
1142
@app.errorhandler(500)
def internal_error(error):
    """Answer unhandled server errors with a JSON 500 payload."""
    payload = {
        'success': False,
        'message': 'Internal server error'
    }
    return jsonify(payload), 500
1148
+
1149
if __name__ == '__main__':
    # Hugging Face Spaces uses port 7860 by default
    port = int(os.getenv('PORT', 7860))
    separator = "=" * 60

    for banner_line in (
        "🚌 Natural Language Transport Query System",
        separator,
        f"🚀 Starting on port {port}",
        f"🌐 Open your browser and go to: http://localhost:{port}",
    ):
        print(banner_line)

    # Report the Neo4j connection state and, when available, the data volume
    if not neo4j_service.is_connected():
        print("⚠️ Neo4j not connected - some features may not work")
    else:
        print("✅ Connected to Neo4j database")
        stats = neo4j_service.get_route_statistics()
        if stats:
            place_count = stats.get('total_places', 0)
            route_count = stats.get('total_routes', 0)
            print(f"📊 Database: {place_count} places, {route_count} routes")

    # Report whether the spell corrector has LLM support
    print(
        "🤖 LLM integration available for spell correction"
        if spell_corrector.llm_available
        else "⚠️ LLM not available - using fuzzy matching only"
    )

    capabilities = (
        "Multiple query formats (fare, price, cost)",
        "Natural language patterns (from X to Y, X to Y fare)",
        "Question formats (What is, How much, Show me)",
        "Compact formats (Colombo to Kandy fare)",
        "Spell correction and fuzzy matching",
        "LLM-powered query interpretation",
        "Automatic Cypher query generation",
        "Advanced intent classification",
        "Entity extraction and normalization",
        "Comparison queries (vs, versus, compare)",
        "Range search queries (under, over, between)",
        "Recommendation queries (recommend, suggest)",
        "Confidence scoring for query understanding",
        "Multi-language support: Sinhala, Tamil, Singlish, English",
        "Automatic language detection and translation",
        "Dictionary-based, LLM, and API translation methods",
        "Response translation back to detected language",
    )
    print("\n🎯 Enhanced Natural Language Capabilities:")
    for capability in capabilities:
        print(f" • {capability}")

    endpoints = (
        ("/api/query", "Process natural language queries (enhanced NLP)"),
        ("/api/nlp/capabilities", "View enhanced NLP capabilities with live examples"),
        ("/api/nlp/test-all-types", "Test all query types with live results"),
        ("/api/nlp/test", "Test queries with detailed analysis"),
        ("/api/nlp/demo", "Get comprehensive demo queries"),
        ("/api/examples", "Get categorized example queries"),
        ("/api/sinhala/examples", "Get Sinhala example queries"),
        ("/api/tamil/examples", "Get Tamil example queries"),
        ("/api/language/detect", "Detect language of input text"),
        ("/api/translation/test", "Test translation functionality"),
        ("/api/translation/translate", "Translate text between languages"),
        ("/api/status", "System status and statistics"),
        ("/api/suggestions", "Get location suggestions"),
        ("/api/places", "Get all available places"),
    )
    print("\n🔗 Available API Endpoints:")
    for path, purpose in endpoints:
        print(f" • {path} - {purpose}")

    print(separator)

    try:
        # debug stays off: this entry point is used for the deployed app
        app.run(debug=False, port=port, host='0.0.0.0')
    except Exception as e:
        print(f"❌ Error starting application: {e}")
        print("💡 Try running as administrator or check if another application is using the port")
1214
+
1215
+
enhanced_nlp_processor.py CHANGED
@@ -1,904 +1,904 @@
1
- #!/usr/bin/env python3
2
- """
3
- Enhanced NLP Processor for Transport Query Application
4
- Advanced natural language understanding and query processing
5
- """
6
-
7
- import re
8
- import json
9
- from typing import Dict, List, Tuple, Optional, Any
10
- from datetime import datetime
11
- from spell_corrector import SpellCorrector
12
- from neo4j_service import Neo4jService
13
- from config import Config
14
- from logger import get_logger
15
-
16
- class EnhancedNLPProcessor:
17
- """Advanced NLP processor with sophisticated query understanding"""
18
-
19
def __init__(self):
    """Create the collaborating services and the static NLP lookup tables.

    Instantiates ``Config``, ``SpellCorrector`` and ``Neo4jService`` and a
    logger named after this class, then defines two tables:
    ``query_patterns`` (regular expressions grouped by query category) and
    ``intent_keywords`` (keyword lists scored by ``_classify_intent``).
    """
    self.config = Config()
    self.spell_corrector = SpellCorrector()
    self.neo4j_service = Neo4jService()
    self.logger = get_logger(self.__class__.__name__)

    # Query patterns and templates.
    # Capture groups hold place names (letters and whitespace only) or, for
    # the range patterns, digit/comma amounts. The arrow alternatives
    # (→, ->) are accepted alongside the word "to".
    # NOTE(review): no method visible in this part of the file reads
    # self.query_patterns - confirm it is used elsewhere before editing.
    self.query_patterns = {
        'fare_queries': [
            r'(?:what\s+is\s+)?(?:the\s+)?(?:fare|price|cost)(?:\s+of)?(?:\s+from)?\s+([a-zA-Z\s]+)\s+(?:to|→|->)\s+([a-zA-Z\s]+)',
            r'(?:what\s+is\s+)?(?:the\s+)?(?:bus\s+)?(?:fare|price|cost)(?:\s+of)?(?:\s+from)?\s+([a-zA-Z\s]+)\s+(?:to|→|->)\s+([a-zA-Z\s]+)',
            r'(?:how\s+much\s+)?(?:is|does)\s+(?:the\s+)?(?:bus\s+)?(?:fare|price|cost)\s+(?:from\s+)?([a-zA-Z\s]+)\s+(?:to|→|->)\s+([a-zA-Z\s]+)',
            r'([a-zA-Z\s]+)\s+(?:to|→|->)\s+([a-zA-Z\s]+)\s+(?:fare|price|cost)',
            r'(?:fare|price|cost)\s+(?:from\s+)?([a-zA-Z\s]+)\s+(?:to|→|->)\s+([a-zA-Z\s]+)',
            r'(?:travel|transport)\s+(?:cost|price|fare)\s+(?:from\s+)?([a-zA-Z\s]+)\s+(?:to|→|->)\s+([a-zA-Z\s]+)',
            r'(?:bus|train)\s+(?:fare|price|cost)\s+(?:from\s+)?([a-zA-Z\s]+)\s+(?:to|→|->)\s+([a-zA-Z\s]+)',
            r'(?:ticket\s+price|ticket\s+fare)\s+(?:from\s+)?([a-zA-Z\s]+)\s+(?:to|→|->)\s+([a-zA-Z\s]+)'
        ],
        # Four groups: origin/destination of the first and second route
        'comparison_queries': [
            r'(?:compare|difference)\s+(?:between\s+)?(?:fares?|prices?|costs?)\s+(?:from\s+)?([a-zA-Z\s]+)\s+(?:to|→|->)\s+([a-zA-Z\s]+)\s+(?:and|vs|versus)\s+([a-zA-Z\s]+)\s+(?:to|→|->)\s+([a-zA-Z\s]+)',
            r'(?:which\s+is\s+)?(?:cheaper|more\s+expensive)\s+(?:between\s+)?([a-zA-Z\s]+)\s+(?:to|→|->)\s+([a-zA-Z\s]+)\s+(?:and|vs|versus)\s+([a-zA-Z\s]+)\s+(?:to|→|->)\s+([a-zA-Z\s]+)'
        ],
        # Groups capture the numeric bound(s) as written (digits and commas)
        'range_queries': [
            r'(?:routes?|fares?|prices?)\s+(?:between|from)\s+([0-9,]+)\s+(?:and|to)\s+([0-9,]+)\s+(?:rupees?|rs?)',
            r'(?:find|show)\s+(?:routes?|fares?|prices?)\s+(?:under|below|less\s+than)\s+([0-9,]+)\s+(?:rupees?|rs?)',
            r'(?:find|show)\s+(?:routes?|fares?|prices?)\s+(?:over|above|more\s+than)\s+([0-9,]+)\s+(?:rupees?|rs?)'
        ],
        'route_queries': [
            r'(?:routes?|buses?|trains?)\s+(?:from|departing\s+from)\s+([a-zA-Z\s]+)',
            r'(?:routes?|buses?|trains?)\s+(?:to|arriving\s+at)\s+([a-zA-Z\s]+)',
            r'(?:how\s+many\s+)?(?:routes?|buses?|trains?)\s+(?:connect|go\s+to|from)\s+([a-zA-Z\s]+)',
            r'(?:direct|non-stop)\s+(?:routes?|buses?|trains?)\s+(?:from\s+)?([a-zA-Z\s]+)\s+(?:to|→|->)\s+([a-zA-Z\s]+)'
        ],
        # No capture groups: these only detect that a statistic is requested
        'statistical_queries': [
            r'(?:average|mean|median)\s+(?:fare|price|cost)',
            r'(?:total|sum)\s+(?:of\s+)?(?:all\s+)?(?:fares?|prices?|costs?)',
            r'(?:how\s+many\s+)?(?:routes?|places?|locations?)',
            r'(?:database|system)\s+(?:statistics?|stats?|overview)',
            r'(?:summary|overview)\s+(?:of\s+)?(?:transport|fare)\s+(?:data|database)'
        ],
        'recommendation_queries': [
            r'(?:recommend|suggest)\s+(?:cheap|budget|affordable)\s+(?:routes?|options?)',
            r'(?:best|optimal)\s+(?:route|way)\s+(?:from\s+)?([a-zA-Z\s]+)\s+(?:to|→|->)\s+([a-zA-Z\s]+)',
            r'(?:popular|frequent)\s+(?:routes?|destinations?)',
            r'(?:hidden|secret|unknown)\s+(?:routes?|destinations?)'
        ]
    }

    # Query intent classification.
    # _classify_intent counts how many of each intent's keywords occur in
    # the query as plain substrings.
    self.intent_keywords = {
        'fare_inquiry': ['fare', 'price', 'cost', 'how much', 'what is the cost'],
        'route_inquiry': ['route', 'bus', 'train', 'transport', 'how to get', 'way to'],
        'comparison': ['compare', 'difference', 'vs', 'versus', 'which is', 'better'],
        'statistics': ['statistics', 'stats', 'overview', 'summary', 'total', 'average'],
        'recommendation': ['recommend', 'suggest', 'best', 'optimal', 'popular'],
        'range_search': ['between', 'under', 'over', 'above', 'below', 'range'],
        'availability': ['available', 'exist', 'have', 'is there', 'can i']
    }
77
-
78
def process_query(self, user_query: str) -> Dict[str, Any]:
    """Answer a natural-language transport query.

    The query is normalised, entities and intent are extracted, a Cypher
    statement is generated and executed, and the formatted response is
    annotated with a ``query_analysis`` section.

    Args:
        user_query: Natural language query string.

    Returns:
        The response dictionary. When any step raises, a dictionary with
        ``success: False``, an error ``message`` and ``suggestions``.
    """
    try:
        # Step 1: normalise the raw text
        cleaned = self._preprocess_query(user_query)
        self.logger.info(f"Processing query: original='{user_query}', preprocessed='{cleaned}'")

        # Step 2: entities first, because intent classification uses them
        entities = self._extract_entities(cleaned)
        intent = self._classify_intent(cleaned, entities)

        # Step 3: translate the analysis into Cypher
        cypher = self._generate_cypher_query(intent, entities, cleaned)
        self.logger.debug(f"Intent: {intent}; Entities: {entities}; Cypher: {str(cypher).strip()[:200]}")

        # Step 4: run the statement, or fall back when none could be built
        if not cypher:
            response = self._handle_unclear_query(cleaned)
        else:
            rows = self._execute_query(cypher)
            self.logger.info(f"Query results count: {len(rows)}")
            response = self._format_response(intent, entities, rows, cleaned)

        # Step 5: attach the analysis metadata
        analysis = {
            'original_query': user_query,
            'processed_query': cleaned,
            'intent': intent,
            'entities': entities,
            'confidence': self._calculate_confidence(intent, entities)
        }
        response.update({'query_analysis': analysis})
        return response

    except Exception as e:
        return {
            'success': False,
            'message': f'Error processing query: {str(e)}',
            'suggestions': self._get_suggestions()
        }
130
-
131
- def _preprocess_query(self, query: str) -> str:
132
- """Preprocess and normalize the query"""
133
- # Convert to lowercase
134
- query = query.lower().strip()
135
-
136
- # Remove extra whitespace
137
- query = re.sub(r'\s+', ' ', query)
138
-
139
- # Normalize common variations
140
- replacements = {
141
- 'rs.': 'rupees',
142
- 'rs': 'rupees',
143
- 'lkr': 'rupees',
144
- '→': 'to',
145
- '->': 'to',
146
- 'vs': 'versus',
147
- '&': 'and',
148
- 'w/': 'with',
149
- 'w/o': 'without'
150
- }
151
-
152
- for old, new in replacements.items():
153
- query = query.replace(old, new)
154
-
155
- return query
156
-
157
- def _extract_entities(self, query: str) -> Dict[str, Any]:
158
- """Extract entities from the query"""
159
- entities = {
160
- 'locations': [],
161
- 'numbers': [],
162
- 'currencies': [],
163
- 'comparators': [],
164
- 'time_expressions': []
165
- }
166
-
167
- # Extract locations with priority for different query types
168
- comparison_patterns = [
169
- r'(?:which\s+is\s+)?(?:cheaper|more\s+expensive)\s+(?:between\s+)?([a-zA-Z\s]+?)\s+to\s+([a-zA-Z\s]+?)\s+(?:and|vs|versus)\s+([a-zA-Z\s]+?)\s+to\s+([a-zA-Z\s]+?)(?:\s|$|\?)',
170
- r'(?:what\s+is\s+)?(?:the\s+)?(?:difference|compare)\s+(?:in\s+)?(?:fare|price|cost)\s+(?:between\s+)?([a-zA-Z\s]+?)\s+to\s+([a-zA-Z\s]+?)\s+(?:and|vs|versus)\s+([a-zA-Z\s]+?)\s+to\s+([a-zA-Z\s]+?)(?:\s|$|\?)',
171
- r'(?:compare|difference)\s+(?:between\s+)?(?:fares?|prices?|costs?)\s+(?:from\s+)?([a-zA-Z\s]+?)\s+to\s+([a-zA-Z\s]+?)\s+(?:and|vs|versus)\s+([a-zA-Z\s]+?)\s+to\s+([a-zA-Z\s]+?)(?:\s|$|\?)',
172
- # Simpler patterns for comparison
173
- r'([a-zA-Z\s]+?)\s+to\s+([a-zA-Z\s]+?)\s+(?:and|vs|versus)\s+([a-zA-Z\s]+?)\s+to\s+([a-zA-Z\s]+?)(?:\s|$|\?)',
174
- r'([a-zA-Z\s]+?)\s+to\s+([a-zA-Z\s]+?)\s+(?:and|vs|versus)\s+([a-zA-Z\s]+?)(?:\s|$|\?)'
175
- ]
176
-
177
- fare_patterns = [
178
- r'(?:fare|price|cost)\s+(?:of|from)?\s+([a-zA-Z\s]+?)\s+to\s+([a-zA-Z\s]+?)(?:\s|$|\?)',
179
- r'(?:what\s+is\s+)?(?:the\s+)?(?:fare|price|cost)(?:\s+of)?(?:\s+from)?\s+([a-zA-Z\s]+?)\s+to\s+([a-zA-Z\s]+?)(?:\s|$|\?)',
180
- r'(?:how\s+much\s+)?(?:is|does)\s+(?:the\s+)?(?:fare|price|cost)\s+(?:from\s+)?([a-zA-Z\s]+?)\s+to\s+([a-zA-Z\s]+?)(?:\s|$|\?)'
181
- ]
182
-
183
- general_patterns = [
184
- r'from\s+([a-zA-Z\s]+?)\s+to\s+([a-zA-Z\s]+?)(?:\s|$|\?)',
185
- r'([a-zA-Z\s]+?)\s+to\s+([a-zA-Z\s]+?)(?:\s|$|\?)',
186
- r'between\s+([a-zA-Z\s]+?)\s+and\s+([a-zA-Z\s]+?)(?:\s|$|\?)'
187
- ]
188
-
189
- # Use a set to avoid duplicates
190
- seen_locations = set()
191
-
192
- # Try comparison patterns first (highest priority)
193
- for pattern in comparison_patterns:
194
- matches = re.finditer(pattern, query, re.IGNORECASE)
195
- for match in matches:
196
- locations = [loc.strip() for loc in match.groups() if loc.strip()]
197
- for loc in locations:
198
- # Skip if we've already processed this location
199
- if loc.lower() in seen_locations:
200
- continue
201
- seen_locations.add(loc.lower())
202
-
203
- corrected, confidence, method = self.spell_corrector.correct_location(loc)
204
- if confidence > 0.5:
205
- entities['locations'].append({
206
- 'original': loc,
207
- 'corrected': corrected,
208
- 'confidence': confidence,
209
- 'method': method
210
- })
211
-
212
- # If no locations found with comparison patterns, try fare patterns
213
- if not entities['locations']:
214
- for pattern in fare_patterns:
215
- matches = re.finditer(pattern, query, re.IGNORECASE)
216
- for match in matches:
217
- locations = [loc.strip() for loc in match.groups() if loc.strip()]
218
- for loc in locations:
219
- # Skip if we've already processed this location
220
- if loc.lower() in seen_locations:
221
- continue
222
- seen_locations.add(loc.lower())
223
-
224
- corrected, confidence, method = self.spell_corrector.correct_location(loc)
225
- if confidence > 0.5:
226
- entities['locations'].append({
227
- 'original': loc,
228
- 'corrected': corrected,
229
- 'confidence': confidence,
230
- 'method': method
231
- })
232
-
233
- # If no locations found with fare patterns, try general patterns
234
- if not entities['locations']:
235
- for pattern in general_patterns:
236
- matches = re.finditer(pattern, query, re.IGNORECASE)
237
- for match in matches:
238
- locations = [loc.strip() for loc in match.groups() if loc.strip()]
239
- for loc in locations:
240
- # Skip if we've already processed this location
241
- if loc.lower() in seen_locations:
242
- continue
243
- seen_locations.add(loc.lower())
244
-
245
- corrected, confidence, method = self.spell_corrector.correct_location(loc)
246
- if confidence > 0.5:
247
- entities['locations'].append({
248
- 'original': loc,
249
- 'corrected': corrected,
250
- 'confidence': confidence,
251
- 'method': method
252
- })
253
-
254
-
255
-
256
-
257
-
258
- # Extract numbers and currencies
259
- number_patterns = [
260
- r'(under|below|less\s+than|over|above|more\s+than)\s+(\d+(?:,\d+)*(?:\.\d+)?)\s*(rupees?|rs?|lkr)?',
261
- r'between\s+(\d+(?:,\d+)*(?:\.\d+)?)\s+and\s+(\d+(?:,\d+)*(?:\.\d+)?)\s*(rupees?|rs?|lkr)?',
262
- r'(\d+(?:,\d+)*(?:\.\d+)?)\s*(rupees?|rs?|lkr)?'
263
- ]
264
-
265
- for pattern in number_patterns:
266
- matches = re.finditer(pattern, query, re.IGNORECASE)
267
- for match in matches:
268
- groups = match.groups()
269
- if len(groups) >= 2:
270
- if groups[0] in ['under', 'below', 'less than', 'over', 'above', 'more than']:
271
- # Pattern: (under|below|less than|over|above|more than) (number) (currency)
272
- comparator = groups[0]
273
- number = groups[1]
274
- currency = groups[2] if len(groups) >= 3 else 'rupees'
275
-
276
- entities['numbers'].append({
277
- 'value': float(number.replace(',', '')),
278
- 'currency': currency,
279
- 'comparator': comparator
280
- })
281
- elif 'between' in pattern:
282
- # Pattern: between (number1) and (number2) (currency)
283
- min_number = groups[0]
284
- max_number = groups[1]
285
- currency = groups[2] if len(groups) >= 3 else 'rupees'
286
-
287
- entities['numbers'].append({
288
- 'value': float(min_number.replace(',', '')),
289
- 'currency': currency,
290
- 'comparator': 'between_min'
291
- })
292
- entities['numbers'].append({
293
- 'value': float(max_number.replace(',', '')),
294
- 'currency': currency,
295
- 'comparator': 'between_max'
296
- })
297
- else:
298
- # Pattern: (number) (currency)
299
- number = groups[0]
300
- currency = groups[1] if len(groups) >= 2 else 'rupees'
301
-
302
- entities['numbers'].append({
303
- 'value': float(number.replace(',', '')),
304
- 'currency': currency,
305
- 'comparator': None
306
- })
307
-
308
- # Extract comparators
309
- comparator_patterns = [
310
- r'(cheaper|more\s+expensive|better|worse|faster|slower)',
311
- r'(compare|difference|vs|versus)',
312
- r'(under|below|less\s+than|over|above|more\s+than)'
313
- ]
314
-
315
- for pattern in comparator_patterns:
316
- matches = re.finditer(pattern, query, re.IGNORECASE)
317
- for match in matches:
318
- entities['comparators'].append(match.group(1).lower())
319
-
320
- return entities
321
-
322
- def _classify_intent(self, query: str, entities: Dict = None) -> Dict[str, Any]:
323
- """Classify the intent of the query"""
324
- intent_scores = {}
325
-
326
- for intent, keywords in self.intent_keywords.items():
327
- score = 0
328
- for keyword in keywords:
329
- if keyword in query:
330
- score += 1
331
- intent_scores[intent] = score
332
-
333
- # Get primary intent
334
- primary_intent = max(intent_scores.items(), key=lambda x: x[1])
335
-
336
- # Check for specific patterns with priority
337
- if any(pattern in query for pattern in ['compare', 'difference', 'vs', 'versus', 'cheaper', 'more expensive']):
338
- primary_intent = ('comparison', 10)
339
- elif any(pattern in query for pattern in ['recommend', 'suggest', 'best', 'optimal', 'popular']):
340
- primary_intent = ('recommendation', 10)
341
- elif any(pattern in query for pattern in ['between', 'under', 'over', 'above', 'below', 'range']):
342
- primary_intent = ('range_search', 10)
343
- elif any(pattern in query for pattern in ['fare', 'price', 'cost', 'how much']):
344
- # Check if we have at least 2 locations
345
- if entities and len(entities.get('locations', [])) >= 2:
346
- primary_intent = ('fare_inquiry', 10)
347
- elif any(pattern in query for pattern in ['route', 'bus', 'train', 'transport']):
348
- primary_intent = ('route_inquiry', 10)
349
-
350
- return {
351
- 'primary': primary_intent[0],
352
- 'confidence': primary_intent[1] / 10,
353
- 'all_scores': intent_scores
354
- }
355
-
356
def _generate_cypher_query(self, intent: Dict, entities: Dict, query: str) -> Optional[str]:
    """Build the Cypher statement for an analysed query.

    LLM-based generation is attempted first. When it raises or yields
    nothing, a rule-based builder is chosen from ``intent['primary']``,
    with a generic fallback for intents that have no dedicated builder.

    Returns:
        The Cypher text, or None when the selected builder produces none.
    """
    try:
        llm_candidate = self._generate_cypher_with_llm(query, intent, entities)
        if llm_candidate:
            return llm_candidate
    except Exception as e:
        print(f"LLM query generation failed: {e}")

    # Rule-based fallback: one builder per known intent
    builders = {
        'fare_inquiry': lambda: self._generate_fare_query(entities),
        'comparison': lambda: self._generate_comparison_query(entities),
        'route_inquiry': lambda: self._generate_route_query(entities, query),
        'statistics': lambda: self._generate_statistics_query(entities),
        'recommendation': lambda: self._generate_recommendation_query(entities, query),
        'range_search': lambda: self._generate_range_query(entities),
    }
    build = builders.get(intent['primary'])
    if build is None:
        return self._generate_fallback_query(query)
    return build()
383
-
384
- def _generate_fare_query(self, entities: Dict) -> Optional[str]:
385
- """Generate fare inquiry Cypher query"""
386
- locations = entities.get('locations', [])
387
-
388
- if len(locations) >= 2:
389
- from_loc = locations[0]['corrected']
390
- to_loc = locations[1]['corrected']
391
-
392
- return f"""
393
- MATCH (a:Place {{name: '{from_loc}'}})-[r:Fare]->(b:Place {{name: '{to_loc}'}})
394
- RETURN
395
- a.name as from_place,
396
- b.name as to_place,
397
- r.fare as fare,
398
- 'Direct route' as route_type
399
- """
400
-
401
- return None
402
-
403
- def _generate_comparison_query(self, entities: Dict) -> Optional[str]:
404
- """Generate comparison Cypher query"""
405
- locations = entities.get('locations', [])
406
-
407
- if len(locations) >= 3:
408
- # Handle case where we have same origin, different destinations
409
- if len(locations) == 3:
410
- # Pattern: "Colombo to Kandy and Colombo to Anuradapura"
411
- route1_from = locations[0]['corrected']
412
- route1_to = locations[1]['corrected']
413
- route2_from = locations[0]['corrected'] # Same origin
414
- route2_to = locations[2]['corrected']
415
- elif len(locations) >= 4:
416
- # Pattern: "Colombo to Kandy and Anuradapura to Galle"
417
- route1_from = locations[0]['corrected']
418
- route1_to = locations[1]['corrected']
419
- route2_from = locations[2]['corrected']
420
- route2_to = locations[3]['corrected']
421
- else:
422
- return None
423
-
424
- return f"""
425
- MATCH (a1:Place {{name: '{route1_from}'}})-[r1:Fare]->(b1:Place {{name: '{route1_to}'}})
426
- MATCH (a2:Place {{name: '{route2_from}'}})-[r2:Fare]->(b2:Place {{name: '{route2_to}'}})
427
- RETURN
428
- a1.name + ' to ' + b1.name as route1,
429
- r1.fare as fare1,
430
- a2.name + ' to ' + b2.name as route2,
431
- r2.fare as fare2,
432
- r1.fare - r2.fare as difference,
433
- CASE
434
- WHEN r1.fare < r2.fare THEN 'Route 1 is cheaper'
435
- WHEN r1.fare > r2.fare THEN 'Route 2 is cheaper'
436
- ELSE 'Both routes have the same fare'
437
- END as comparison
438
- """
439
-
440
- return None
441
-
442
- def _generate_route_query(self, entities: Dict, query: str) -> Optional[str]:
443
- """Generate route inquiry Cypher query"""
444
- locations = entities.get('locations', [])
445
-
446
- if 'from' in query and locations:
447
- location = locations[0]['corrected']
448
- return f"""
449
- MATCH (a:Place {{name: '{location}'}})-[r:Fare]->(b:Place)
450
- RETURN
451
- a.name as from_place,
452
- b.name as to_place,
453
- r.fare as fare
454
- ORDER BY r.fare
455
- """
456
- elif 'to' in query and locations:
457
- location = locations[0]['corrected']
458
- return f"""
459
- MATCH (a:Place)-[r:Fare]->(b:Place {{name: '{location}'}})
460
- RETURN
461
- a.name as from_place,
462
- b.name as to_place,
463
- r.fare as fare
464
- ORDER BY r.fare
465
- """
466
-
467
- return None
468
-
469
- def _generate_statistics_query(self, entities: Dict) -> str:
470
- """Generate statistics Cypher query"""
471
- return """
472
- MATCH (p:Place)
473
- MATCH ()-[r:Fare]->()
474
- RETURN
475
- count(DISTINCT p) as total_places,
476
- count(r) as total_routes,
477
- round(avg(r.fare), 2) as average_fare,
478
- min(r.fare) as minimum_fare,
479
- max(r.fare) as maximum_fare,
480
- round(stdDev(r.fare), 2) as fare_standard_deviation
481
- """
482
-
483
- def _generate_recommendation_query(self, entities: Dict, query: str) -> str:
484
- """Generate recommendation Cypher query"""
485
- if 'cheap' in query or 'budget' in query or 'affordable' in query:
486
- return """
487
- MATCH (a:Place)-[r:Fare]->(b:Place)
488
- RETURN
489
- a.name as from_place,
490
- b.name as to_place,
491
- r.fare as fare
492
- ORDER BY r.fare ASC
493
- LIMIT 10
494
- """
495
- elif 'popular' in query or 'frequent' in query:
496
- return """
497
- MATCH (a:Place)-[r:Fare]->(b:Place)
498
- RETURN
499
- a.name as from_place,
500
- b.name as to_place,
501
- r.fare as fare
502
- ORDER BY r.fare DESC
503
- LIMIT 10
504
- """
505
- else:
506
- return """
507
- MATCH (a:Place)-[r:Fare]->(b:Place)
508
- RETURN
509
- a.name as from_place,
510
- b.name as to_place,
511
- r.fare as fare
512
- ORDER BY r.fare ASC
513
- LIMIT 5
514
- """
515
-
516
- def _generate_range_query(self, entities: Dict) -> Optional[str]:
517
- """Generate range search Cypher query"""
518
- numbers = entities.get('numbers', [])
519
-
520
- if numbers:
521
- # Check for between range
522
- between_min = None
523
- between_max = None
524
- single_value = None
525
- single_comparator = None
526
-
527
- for number in numbers:
528
- comparator = number.get('comparator', '')
529
- value = number['value']
530
-
531
- if comparator == 'between_min':
532
- between_min = value
533
- elif comparator == 'between_max':
534
- between_max = value
535
- elif comparator in ['under', 'below', 'less than', 'over', 'above', 'more than']:
536
- single_value = value
537
- single_comparator = comparator
538
-
539
- # Generate query based on type
540
- if between_min is not None and between_max is not None:
541
- return f"""
542
- MATCH (a:Place)-[r:Fare]->(b:Place)
543
- WHERE r.fare >= {between_min} AND r.fare <= {between_max}
544
- RETURN
545
- a.name as from_place,
546
- b.name as to_place,
547
- r.fare as fare
548
- ORDER BY r.fare ASC
549
- """
550
- elif single_value is not None and single_comparator is not None:
551
- if single_comparator in ['under', 'below', 'less than']:
552
- return f"""
553
- MATCH (a:Place)-[r:Fare]->(b:Place)
554
- WHERE r.fare < {single_value}
555
- RETURN
556
- a.name as from_place,
557
- b.name as to_place,
558
- r.fare as fare
559
- ORDER BY r.fare ASC
560
- """
561
- elif single_comparator in ['over', 'above', 'more than']:
562
- return f"""
563
- MATCH (a:Place)-[r:Fare]->(b:Place)
564
- WHERE r.fare > {single_value}
565
- RETURN
566
- a.name as from_place,
567
- b.name as to_place,
568
- r.fare as fare
569
- ORDER BY r.fare DESC
570
- """
571
-
572
- return None
573
-
574
- def _generate_cypher_with_llm(self, query: str, intent: Dict, entities: Dict) -> Optional[str]:
575
- """Generate Cypher query using LLM for better understanding"""
576
- try:
577
- if not self.config.OPENAI_API_KEY:
578
- return None
579
-
580
- # Get available places for context
581
- available_places = list(self.neo4j_service.get_all_places())
582
-
583
- # Create comprehensive prompt for Cypher generation
584
- prompt = f"""
585
- You are a Neo4j Cypher query generator for a transport database.
586
-
587
- Database Schema:
588
- - Nodes: Place (with property 'name')
589
- - Relationships: Fare (with property 'fare')
590
-
591
- Available Places: {', '.join(available_places[:50])}... (total: {len(available_places)})
592
-
593
- User Query: "{query}"
594
- Detected Intent: {intent.get('primary', 'unknown')}
595
- Extracted Entities: {entities}
596
-
597
- Your task is to generate a valid Cypher query that answers the user's question.
598
-
599
- Query Types and Examples:
600
-
601
- 1. FARE INQUIRY:
602
- - "What is the fare from Colombo to Kandy?"
603
- - Cypher: MATCH (a:Place {{name: 'Colombo'}})-[r:Fare]->(b:Place {{name: 'Kandy'}}) RETURN a.name as from_place, b.name as to_place, r.fare as fare
604
-
605
- 2. COMPARISON:
606
- - "Compare fares from Colombo to Kandy vs Colombo to Galle"
607
- - Cypher: MATCH (a1:Place {{name: 'Colombo'}})-[r1:Fare]->(b1:Place {{name: 'Kandy'}}) MATCH (a2:Place {{name: 'Colombo'}})-[r2:Fare]->(b2:Place {{name: 'Galle'}}) RETURN a1.name + ' to ' + b1.name as route1, r1.fare as fare1, a2.name + ' to ' + b2.name as route2, r2.fare as fare2, r1.fare - r2.fare as difference
608
-
609
- 3. RANGE SEARCH:
610
- - "Find routes under 500 rupees"
611
- - Cypher: MATCH (a:Place)-[r:Fare]->(b:Place) WHERE r.fare < 500 RETURN a.name as from_place, b.name as to_place, r.fare as fare ORDER BY r.fare ASC
612
-
613
- 4. RECOMMENDATION:
614
- - "Recommend cheap routes"
615
- - Cypher: MATCH (a:Place)-[r:Fare]->(b:Place) RETURN a.name as from_place, b.name as to_place, r.fare as fare ORDER BY r.fare ASC LIMIT 10
616
-
617
- 5. STATISTICS:
618
- - "What is the average fare?"
619
- - Cypher: MATCH ()-[r:Fare]->() RETURN round(avg(r.fare), 2) as average_fare, min(r.fare) as min_fare, max(r.fare) as max_fare
620
-
621
- 6. ROUTE INQUIRY:
622
- - "Routes from Colombo"
623
- - Cypher: MATCH (a:Place {{name: 'Colombo'}})-[r:Fare]->(b:Place) RETURN a.name as from_place, b.name as to_place, r.fare as fare ORDER BY r.fare
624
-
625
- Important Rules:
626
- 1. Always use proper Cypher syntax
627
- 2. Use exact place names from the available places list
628
- 3. For comparisons, use multiple MATCH clauses
629
- 4. For ranges, use WHERE clauses with appropriate operators
630
- 5. For statistics, use aggregation functions
631
- 6. Always include meaningful column aliases
632
- 7. Use ORDER BY for sorted results
633
- 8. Use LIMIT for large result sets
634
-
635
- Return ONLY the Cypher query, nothing else. If you cannot generate a valid query, return "FALLBACK".
636
- """
637
-
638
- cypher_query = None
639
- # Prefer new SDK
640
- try:
641
- from openai import OpenAI
642
- client = OpenAI(api_key=self.config.OPENAI_API_KEY)
643
- response = client.chat.completions.create(
644
- model=self.config.OPENAI_MODEL,
645
- messages=[
646
- {"role": "system", "content": "You are a Cypher query generator. Return only valid Cypher queries."},
647
- {"role": "user", "content": prompt}
648
- ],
649
- max_tokens=300,
650
- temperature=0.1
651
- )
652
- cypher_query = response.choices[0].message.content.strip()
653
- except Exception as sdk_err:
654
- import openai
655
- try:
656
- openai.api_key = self.config.OPENAI_API_KEY
657
- response = openai.ChatCompletion.create(
658
- model=self.config.OPENAI_MODEL,
659
- messages=[
660
- {"role": "system", "content": "You are a Cypher query generator. Return only valid Cypher queries."},
661
- {"role": "user", "content": prompt}
662
- ],
663
- max_tokens=300,
664
- temperature=0.1
665
- )
666
- cypher_query = response.choices[0].message.content.strip()
667
- except Exception:
668
- raise sdk_err
669
-
670
- # Validate the response
671
- if cypher_query.upper() == "FALLBACK":
672
- return None
673
-
674
- # Basic validation - check if it starts with MATCH
675
- if cypher_query.upper().startswith('MATCH'):
676
- return cypher_query
677
-
678
- return None
679
-
680
- except Exception as e:
681
- print(f"LLM Cypher generation error: {e}")
682
- return None
683
-
684
- def _generate_fallback_query(self, query: str) -> Optional[str]:
685
- """Generate fallback query when intent is unclear"""
686
- # Try to extract locations using spell corrector
687
- locations = self.spell_corrector.extract_locations_from_query(query)
688
-
689
- if len(locations) >= 2:
690
- from_loc = locations[0][1]
691
- to_loc = locations[1][1]
692
- return f"""
693
- MATCH (a:Place {{name: '{from_loc}'}})-[r:Fare]->(b:Place {{name: '{to_loc}'}})
694
- RETURN
695
- a.name as from_place,
696
- b.name as to_place,
697
- r.fare as fare
698
- """
699
-
700
- # Additional fallback: direct pattern matching for fare queries
701
- if 'fare' in query.lower() or 'price' in query.lower() or 'cost' in query.lower():
702
- import re
703
- fare_patterns = [
704
- r'fare\s+(?:of|from)?\s+([a-zA-Z\s]+)\s+to\s+([a-zA-Z\s]+)',
705
- r'price\s+(?:of|from)?\s+([a-zA-Z\s]+)\s+to\s+([a-zA-Z\s]+)',
706
- r'cost\s+(?:of|from)?\s+([a-zA-Z\s]+)\s+to\s+([a-zA-Z\s]+)',
707
- r'(?:what\s+is\s+)?(?:the\s+)?(?:fare|price|cost)(?:\s+of)?(?:\s+from)?\s+([a-zA-Z\s]+)\s+to\s+([a-zA-Z\s]+)',
708
- r'(?:how\s+much\s+)?(?:is|does)\s+(?:the\s+)?(?:fare|price|cost)\s+(?:from\s+)?([a-zA-Z\s]+)\s+to\s+([a-zA-Z\s]+)'
709
- ]
710
-
711
- for pattern in fare_patterns:
712
- match = re.search(pattern, query.lower())
713
- if match:
714
- from_loc = match.group(1).strip()
715
- to_loc = match.group(2).strip()
716
-
717
- # Correct locations
718
- from_corrected, from_conf, _ = self.spell_corrector.correct_location(from_loc)
719
- to_corrected, to_conf, _ = self.spell_corrector.correct_location(to_loc)
720
-
721
- if from_conf > 0.5 and to_conf > 0.5:
722
- return f"""
723
- MATCH (a:Place {{name: '{from_corrected}'}})-[r:Fare]->(b:Place {{name: '{to_corrected}'}})
724
- RETURN a.name as from_place, b.name as to_place, r.fare as fare
725
- """
726
-
727
- return None
728
-
729
- def _execute_query(self, cypher_query: str) -> List[Dict]:
730
- """Execute Cypher query and return results"""
731
- try:
732
- with self.neo4j_service.driver.session() as session:
733
- result = session.run(cypher_query)
734
- return [dict(record) for record in result]
735
- except Exception as e:
736
- print(f"Query execution error: {e}")
737
- return []
738
-
739
- def _format_response(self, intent: Dict, entities: Dict, results: List[Dict], query: str) -> Dict[str, Any]:
740
- """Format the response based on intent and results"""
741
- primary_intent = intent['primary']
742
-
743
- if not results:
744
- return {
745
- 'success': False,
746
- 'message': 'No results found for your query.',
747
- 'suggestions': self._get_suggestions()
748
- }
749
-
750
- if primary_intent == 'fare_inquiry':
751
- return self._format_fare_response(results, entities)
752
- elif primary_intent == 'comparison':
753
- return self._format_comparison_response(results, entities)
754
- elif primary_intent == 'route_inquiry':
755
- return self._format_route_response(results, entities)
756
- elif primary_intent == 'statistics':
757
- return self._format_statistics_response(results)
758
- elif primary_intent == 'recommendation':
759
- return self._format_recommendation_response(results, query)
760
- elif primary_intent == 'range_search':
761
- return self._format_range_response(results, entities)
762
- else:
763
- return self._format_generic_response(results)
764
-
765
- def _format_fare_response(self, results: List[Dict], entities: Dict) -> Dict[str, Any]:
766
- """Format fare inquiry response"""
767
- if results:
768
- result = results[0]
769
- return {
770
- 'success': True,
771
- 'message': f"The fare from {result['from_place']} to {result['to_place']} is Rs. {result['fare']}",
772
- 'data': results,
773
- 'query_type': 'fare_inquiry',
774
- 'summary': {
775
- 'from_place': result['from_place'],
776
- 'to_place': result['to_place'],
777
- 'fare': result['fare']
778
- }
779
- }
780
- return {'success': False, 'message': 'Fare information not found.'}
781
-
782
- def _format_comparison_response(self, results: List[Dict], entities: Dict) -> Dict[str, Any]:
783
- """Format comparison response"""
784
- if results:
785
- result = results[0]
786
- return {
787
- 'success': True,
788
- 'message': result.get('comparison', 'Comparison completed'),
789
- 'data': results,
790
- 'query_type': 'comparison',
791
- 'summary': {
792
- 'route1': result.get('route1'),
793
- 'route2': result.get('route2'),
794
- 'difference': result.get('difference')
795
- }
796
- }
797
- return {'success': False, 'message': 'Comparison not possible.'}
798
-
799
- def _format_route_response(self, results: List[Dict], entities: Dict) -> Dict[str, Any]:
800
- """Format route inquiry response"""
801
- return {
802
- 'success': True,
803
- 'message': f"Found {len(results)} routes",
804
- 'data': results,
805
- 'query_type': 'route_inquiry',
806
- 'summary': {
807
- 'total_routes': len(results),
808
- 'fare_range': f"Rs. {min(r['fare'] for r in results)} - Rs. {max(r['fare'] for r in results)}" if results else "N/A"
809
- }
810
- }
811
-
812
- def _format_statistics_response(self, results: List[Dict]) -> Dict[str, Any]:
813
- """Format statistics response"""
814
- if results:
815
- stats = results[0]
816
- return {
817
- 'success': True,
818
- 'message': f"Database contains {stats['total_places']} places and {stats['total_routes']} routes",
819
- 'data': results,
820
- 'query_type': 'statistics',
821
- 'summary': {
822
- 'total_places': stats['total_places'],
823
- 'total_routes': stats['total_routes'],
824
- 'average_fare': stats['average_fare'],
825
- 'fare_range': f"Rs. {stats['minimum_fare']} - Rs. {stats['maximum_fare']}"
826
- }
827
- }
828
- return {'success': False, 'message': 'Statistics not available.'}
829
-
830
- def _format_recommendation_response(self, results: List[Dict], query: str) -> Dict[str, Any]:
831
- """Format recommendation response"""
832
- return {
833
- 'success': True,
834
- 'message': f"Here are {len(results)} recommended routes",
835
- 'data': results,
836
- 'query_type': 'recommendation',
837
- 'summary': {
838
- 'recommendations_count': len(results),
839
- 'fare_range': f"Rs. {min(r['fare'] for r in results)} - Rs. {max(r['fare'] for r in results)}" if results else "N/A"
840
- }
841
- }
842
-
843
- def _format_range_response(self, results: List[Dict], entities: Dict) -> Dict[str, Any]:
844
- """Format range search response"""
845
- return {
846
- 'success': True,
847
- 'message': f"Found {len(results)} routes in your specified range",
848
- 'data': results,
849
- 'query_type': 'range_search',
850
- 'summary': {
851
- 'routes_found': len(results),
852
- 'fare_range': f"Rs. {min(r['fare'] for r in results)} - Rs. {max(r['fare'] for r in results)}" if results else "N/A"
853
- }
854
- }
855
-
856
- def _format_generic_response(self, results: List[Dict]) -> Dict[str, Any]:
857
- """Format generic response"""
858
- return {
859
- 'success': True,
860
- 'message': f"Found {len(results)} results",
861
- 'data': results,
862
- 'query_type': 'generic'
863
- }
864
-
865
- def _handle_unclear_query(self, query: str) -> Dict[str, Any]:
866
- """Handle unclear or ambiguous queries"""
867
- return {
868
- 'success': False,
869
- 'message': 'I could not understand your query. Please try rephrasing it.',
870
- 'suggestions': self._get_suggestions(),
871
- 'query_type': 'unclear'
872
- }
873
-
874
- def _calculate_confidence(self, intent: Dict, entities: Dict) -> float:
875
- """Calculate confidence score for the query interpretation"""
876
- confidence = 0.0
877
-
878
- # Intent confidence
879
- confidence += intent.get('confidence', 0) * 0.4
880
-
881
- # Entity confidence
882
- locations = entities.get('locations', [])
883
- if locations:
884
- avg_location_confidence = sum(loc['confidence'] for loc in locations) / len(locations)
885
- confidence += avg_location_confidence * 0.4
886
-
887
- # Query complexity bonus
888
- if len(locations) >= 2:
889
- confidence += 0.2
890
-
891
- return min(confidence, 1.0)
892
-
893
- def _get_suggestions(self) -> List[str]:
894
- """Get query suggestions"""
895
- return [
896
- "What is the fare from Colombo to Kandy?",
897
- "Compare fares from Colombo to Kandy vs Colombo to Galle",
898
- "Show me routes from Panadura",
899
- "Find routes under 500 rupees",
900
- "What are the cheapest routes?",
901
- "Show me popular destinations",
902
- "Give me database statistics",
903
- "Recommend affordable routes"
904
- ]
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Enhanced NLP Processor for Transport Query Application
4
+ Advanced natural language understanding and query processing
5
+ """
6
+
7
+ import re
8
+ import json
9
+ from typing import Dict, List, Tuple, Optional, Any
10
+ from datetime import datetime
11
+ from spell_corrector import SpellCorrector
12
+ from neo4j_service import Neo4jService
13
+ from config import Config
14
+ from logger import get_logger
15
+
16
+ class EnhancedNLPProcessor:
17
+ """Advanced NLP processor with sophisticated query understanding"""
18
+
19
def __init__(self):
    """Create the collaborating services and the static pattern tables.

    Instantiates ``Config``, ``SpellCorrector`` and ``Neo4jService`` with no
    arguments and obtains a logger named after the class.
    """
    self.config = Config()
    self.spell_corrector = SpellCorrector()
    self.neo4j_service = Neo4jService()
    self.logger = get_logger(self.__class__.__name__)

    # Query patterns and templates
    # Regexes grouped by query category. Place names are captured with
    # [a-zA-Z\s]+ groups and amounts with [0-9,]+ groups.
    # NOTE(review): no method visible in this part of the file reads
    # self.query_patterns - confirm whether it is still used.
    self.query_patterns = {
        'fare_queries': [
            r'(?:what\s+is\s+)?(?:the\s+)?(?:fare|price|cost)(?:\s+of)?(?:\s+from)?\s+([a-zA-Z\s]+)\s+(?:to|→|->)\s+([a-zA-Z\s]+)',
            r'(?:what\s+is\s+)?(?:the\s+)?(?:bus\s+)?(?:fare|price|cost)(?:\s+of)?(?:\s+from)?\s+([a-zA-Z\s]+)\s+(?:to|→|->)\s+([a-zA-Z\s]+)',
            r'(?:how\s+much\s+)?(?:is|does)\s+(?:the\s+)?(?:bus\s+)?(?:fare|price|cost)\s+(?:from\s+)?([a-zA-Z\s]+)\s+(?:to|→|->)\s+([a-zA-Z\s]+)',
            r'([a-zA-Z\s]+)\s+(?:to|→|->)\s+([a-zA-Z\s]+)\s+(?:fare|price|cost)',
            r'(?:fare|price|cost)\s+(?:from\s+)?([a-zA-Z\s]+)\s+(?:to|→|->)\s+([a-zA-Z\s]+)',
            r'(?:travel|transport)\s+(?:cost|price|fare)\s+(?:from\s+)?([a-zA-Z\s]+)\s+(?:to|→|->)\s+([a-zA-Z\s]+)',
            r'(?:bus|train)\s+(?:fare|price|cost)\s+(?:from\s+)?([a-zA-Z\s]+)\s+(?:to|→|->)\s+([a-zA-Z\s]+)',
            r'(?:ticket\s+price|ticket\s+fare)\s+(?:from\s+)?([a-zA-Z\s]+)\s+(?:to|→|->)\s+([a-zA-Z\s]+)'
        ],
        'comparison_queries': [
            r'(?:compare|difference)\s+(?:between\s+)?(?:fares?|prices?|costs?)\s+(?:from\s+)?([a-zA-Z\s]+)\s+(?:to|→|->)\s+([a-zA-Z\s]+)\s+(?:and|vs|versus)\s+([a-zA-Z\s]+)\s+(?:to|→|->)\s+([a-zA-Z\s]+)',
            r'(?:which\s+is\s+)?(?:cheaper|more\s+expensive)\s+(?:between\s+)?([a-zA-Z\s]+)\s+(?:to|→|->)\s+([a-zA-Z\s]+)\s+(?:and|vs|versus)\s+([a-zA-Z\s]+)\s+(?:to|→|->)\s+([a-zA-Z\s]+)'
        ],
        'range_queries': [
            r'(?:routes?|fares?|prices?)\s+(?:between|from)\s+([0-9,]+)\s+(?:and|to)\s+([0-9,]+)\s+(?:rupees?|rs?)',
            r'(?:find|show)\s+(?:routes?|fares?|prices?)\s+(?:under|below|less\s+than)\s+([0-9,]+)\s+(?:rupees?|rs?)',
            r'(?:find|show)\s+(?:routes?|fares?|prices?)\s+(?:over|above|more\s+than)\s+([0-9,]+)\s+(?:rupees?|rs?)'
        ],
        'route_queries': [
            r'(?:routes?|buses?|trains?)\s+(?:from|departing\s+from)\s+([a-zA-Z\s]+)',
            r'(?:routes?|buses?|trains?)\s+(?:to|arriving\s+at)\s+([a-zA-Z\s]+)',
            r'(?:how\s+many\s+)?(?:routes?|buses?|trains?)\s+(?:connect|go\s+to|from)\s+([a-zA-Z\s]+)',
            r'(?:direct|non-stop)\s+(?:routes?|buses?|trains?)\s+(?:from\s+)?([a-zA-Z\s]+)\s+(?:to|→|->)\s+([a-zA-Z\s]+)'
        ],
        'statistical_queries': [
            r'(?:average|mean|median)\s+(?:fare|price|cost)',
            r'(?:total|sum)\s+(?:of\s+)?(?:all\s+)?(?:fares?|prices?|costs?)',
            r'(?:how\s+many\s+)?(?:routes?|places?|locations?)',
            r'(?:database|system)\s+(?:statistics?|stats?|overview)',
            r'(?:summary|overview)\s+(?:of\s+)?(?:transport|fare)\s+(?:data|database)'
        ],
        'recommendation_queries': [
            r'(?:recommend|suggest)\s+(?:cheap|budget|affordable)\s+(?:routes?|options?)',
            r'(?:best|optimal)\s+(?:route|way)\s+(?:from\s+)?([a-zA-Z\s]+)\s+(?:to|→|->)\s+([a-zA-Z\s]+)',
            r'(?:popular|frequent)\s+(?:routes?|destinations?)',
            r'(?:hidden|secret|unknown)\s+(?:routes?|destinations?)'
        ]
    }

    # Query intent classification
    # Maps each intent name to the keywords counted by _classify_intent;
    # the insertion order matters there because ties go to the first intent.
    self.intent_keywords = {
        'fare_inquiry': ['fare', 'price', 'cost', 'how much', 'what is the cost'],
        'route_inquiry': ['route', 'bus', 'train', 'transport', 'how to get', 'way to'],
        'comparison': ['compare', 'difference', 'vs', 'versus', 'which is', 'better'],
        'statistics': ['statistics', 'stats', 'overview', 'summary', 'total', 'average'],
        'recommendation': ['recommend', 'suggest', 'best', 'optimal', 'popular'],
        'range_search': ['between', 'under', 'over', 'above', 'below', 'range'],
        'availability': ['available', 'exist', 'have', 'is there', 'can i']
    }
77
+
78
def process_query(self, user_query: str) -> Dict[str, Any]:
    """
    Process natural language query with advanced NLP understanding

    The query is normalised, entities and intent are extracted, a Cypher
    query is generated and executed, and the formatted result is returned
    together with a ``query_analysis`` section describing each step.

    Args:
        user_query: Natural language query string

    Returns:
        Dictionary with comprehensive query analysis and results. On any
        unexpected error a failure dictionary with the error text and
        example queries is returned instead of raising.
    """
    try:
        # Step 1: Preprocess query
        processed_query = self._preprocess_query(user_query)
        self.logger.info(f"Processing query: original='{user_query}', preprocessed='{processed_query}'")

        # Step 2: Extract entities and intent
        entities = self._extract_entities(processed_query)
        intent = self._classify_intent(processed_query, entities)

        # Step 3: Generate Cypher query
        cypher_query = self._generate_cypher_query(intent, entities, processed_query)
        self.logger.debug(f"Intent: {intent}; Entities: {entities}; Cypher: {str(cypher_query).strip()[:200]}")

        # Step 4: Execute query and format results
        if cypher_query:
            results = self._execute_query(cypher_query)
            self.logger.info(f"Query results count: {len(results)}")
            response = self._format_response(intent, entities, results, processed_query)
        else:
            response = self._handle_unclear_query(processed_query)

        # Step 5: Add metadata
        response.update({
            'query_analysis': {
                'original_query': user_query,
                'processed_query': processed_query,
                'intent': intent,
                'entities': entities,
                'confidence': self._calculate_confidence(intent, entities)
            }
        })

        return response

    except Exception as e:
        # Keep the user-facing contract (a failure dict, never a raise) but
        # record the traceback so the failure is not lost.
        self.logger.exception(f"Error processing query: {e}")
        return {
            'success': False,
            'message': f'Error processing query: {str(e)}',
            'suggestions': self._get_suggestions()
        }
130
+
131
+ def _preprocess_query(self, query: str) -> str:
132
+ """Preprocess and normalize the query"""
133
+ # Convert to lowercase
134
+ query = query.lower().strip()
135
+
136
+ # Remove extra whitespace
137
+ query = re.sub(r'\s+', ' ', query)
138
+
139
+ # Normalize common variations
140
+ replacements = {
141
+ 'rs.': 'rupees',
142
+ 'rs': 'rupees',
143
+ 'lkr': 'rupees',
144
+ '→': 'to',
145
+ '->': 'to',
146
+ 'vs': 'versus',
147
+ '&': 'and',
148
+ 'w/': 'with',
149
+ 'w/o': 'without'
150
+ }
151
+
152
+ for old, new in replacements.items():
153
+ query = query.replace(old, new)
154
+
155
+ return query
156
+
157
+ def _extract_entities(self, query: str) -> Dict[str, Any]:
158
+ """Extract entities from the query"""
159
+ entities = {
160
+ 'locations': [],
161
+ 'numbers': [],
162
+ 'currencies': [],
163
+ 'comparators': [],
164
+ 'time_expressions': []
165
+ }
166
+
167
+ # Extract locations with priority for different query types
168
+ comparison_patterns = [
169
+ r'(?:which\s+is\s+)?(?:cheaper|more\s+expensive)\s+(?:between\s+)?([a-zA-Z\s]+?)\s+to\s+([a-zA-Z\s]+?)\s+(?:and|vs|versus)\s+([a-zA-Z\s]+?)\s+to\s+([a-zA-Z\s]+?)(?:\s|$|\?)',
170
+ r'(?:what\s+is\s+)?(?:the\s+)?(?:difference|compare)\s+(?:in\s+)?(?:fare|price|cost)\s+(?:between\s+)?([a-zA-Z\s]+?)\s+to\s+([a-zA-Z\s]+?)\s+(?:and|vs|versus)\s+([a-zA-Z\s]+?)\s+to\s+([a-zA-Z\s]+?)(?:\s|$|\?)',
171
+ r'(?:compare|difference)\s+(?:between\s+)?(?:fares?|prices?|costs?)\s+(?:from\s+)?([a-zA-Z\s]+?)\s+to\s+([a-zA-Z\s]+?)\s+(?:and|vs|versus)\s+([a-zA-Z\s]+?)\s+to\s+([a-zA-Z\s]+?)(?:\s|$|\?)',
172
+ # Simpler patterns for comparison
173
+ r'([a-zA-Z\s]+?)\s+to\s+([a-zA-Z\s]+?)\s+(?:and|vs|versus)\s+([a-zA-Z\s]+?)\s+to\s+([a-zA-Z\s]+?)(?:\s|$|\?)',
174
+ r'([a-zA-Z\s]+?)\s+to\s+([a-zA-Z\s]+?)\s+(?:and|vs|versus)\s+([a-zA-Z\s]+?)(?:\s|$|\?)'
175
+ ]
176
+
177
+ fare_patterns = [
178
+ r'(?:fare|price|cost)\s+(?:of|from)?\s+([a-zA-Z\s]+?)\s+to\s+([a-zA-Z\s]+?)(?:\s|$|\?)',
179
+ r'(?:what\s+is\s+)?(?:the\s+)?(?:fare|price|cost)(?:\s+of)?(?:\s+from)?\s+([a-zA-Z\s]+?)\s+to\s+([a-zA-Z\s]+?)(?:\s|$|\?)',
180
+ r'(?:how\s+much\s+)?(?:is|does)\s+(?:the\s+)?(?:fare|price|cost)\s+(?:from\s+)?([a-zA-Z\s]+?)\s+to\s+([a-zA-Z\s]+?)(?:\s|$|\?)'
181
+ ]
182
+
183
+ general_patterns = [
184
+ r'from\s+([a-zA-Z\s]+?)\s+to\s+([a-zA-Z\s]+?)(?:\s|$|\?)',
185
+ r'([a-zA-Z\s]+?)\s+to\s+([a-zA-Z\s]+?)(?:\s|$|\?)',
186
+ r'between\s+([a-zA-Z\s]+?)\s+and\s+([a-zA-Z\s]+?)(?:\s|$|\?)'
187
+ ]
188
+
189
+ # Use a set to avoid duplicates
190
+ seen_locations = set()
191
+
192
+ # Try comparison patterns first (highest priority)
193
+ for pattern in comparison_patterns:
194
+ matches = re.finditer(pattern, query, re.IGNORECASE)
195
+ for match in matches:
196
+ locations = [loc.strip() for loc in match.groups() if loc.strip()]
197
+ for loc in locations:
198
+ # Skip if we've already processed this location
199
+ if loc.lower() in seen_locations:
200
+ continue
201
+ seen_locations.add(loc.lower())
202
+
203
+ corrected, confidence, method = self.spell_corrector.correct_location(loc)
204
+ if confidence > 0.5:
205
+ entities['locations'].append({
206
+ 'original': loc,
207
+ 'corrected': corrected,
208
+ 'confidence': confidence,
209
+ 'method': method
210
+ })
211
+
212
+ # If no locations found with comparison patterns, try fare patterns
213
+ if not entities['locations']:
214
+ for pattern in fare_patterns:
215
+ matches = re.finditer(pattern, query, re.IGNORECASE)
216
+ for match in matches:
217
+ locations = [loc.strip() for loc in match.groups() if loc.strip()]
218
+ for loc in locations:
219
+ # Skip if we've already processed this location
220
+ if loc.lower() in seen_locations:
221
+ continue
222
+ seen_locations.add(loc.lower())
223
+
224
+ corrected, confidence, method = self.spell_corrector.correct_location(loc)
225
+ if confidence > 0.5:
226
+ entities['locations'].append({
227
+ 'original': loc,
228
+ 'corrected': corrected,
229
+ 'confidence': confidence,
230
+ 'method': method
231
+ })
232
+
233
+ # If no locations found with fare patterns, try general patterns
234
+ if not entities['locations']:
235
+ for pattern in general_patterns:
236
+ matches = re.finditer(pattern, query, re.IGNORECASE)
237
+ for match in matches:
238
+ locations = [loc.strip() for loc in match.groups() if loc.strip()]
239
+ for loc in locations:
240
+ # Skip if we've already processed this location
241
+ if loc.lower() in seen_locations:
242
+ continue
243
+ seen_locations.add(loc.lower())
244
+
245
+ corrected, confidence, method = self.spell_corrector.correct_location(loc)
246
+ if confidence > 0.5:
247
+ entities['locations'].append({
248
+ 'original': loc,
249
+ 'corrected': corrected,
250
+ 'confidence': confidence,
251
+ 'method': method
252
+ })
253
+
254
+
255
+
256
+
257
+
258
+ # Extract numbers and currencies
259
+ number_patterns = [
260
+ r'(under|below|less\s+than|over|above|more\s+than)\s+(\d+(?:,\d+)*(?:\.\d+)?)\s*(rupees?|rs?|lkr)?',
261
+ r'between\s+(\d+(?:,\d+)*(?:\.\d+)?)\s+and\s+(\d+(?:,\d+)*(?:\.\d+)?)\s*(rupees?|rs?|lkr)?',
262
+ r'(\d+(?:,\d+)*(?:\.\d+)?)\s*(rupees?|rs?|lkr)?'
263
+ ]
264
+
265
+ for pattern in number_patterns:
266
+ matches = re.finditer(pattern, query, re.IGNORECASE)
267
+ for match in matches:
268
+ groups = match.groups()
269
+ if len(groups) >= 2:
270
+ if groups[0] in ['under', 'below', 'less than', 'over', 'above', 'more than']:
271
+ # Pattern: (under|below|less than|over|above|more than) (number) (currency)
272
+ comparator = groups[0]
273
+ number = groups[1]
274
+ currency = groups[2] if len(groups) >= 3 else 'rupees'
275
+
276
+ entities['numbers'].append({
277
+ 'value': float(number.replace(',', '')),
278
+ 'currency': currency,
279
+ 'comparator': comparator
280
+ })
281
+ elif 'between' in pattern:
282
+ # Pattern: between (number1) and (number2) (currency)
283
+ min_number = groups[0]
284
+ max_number = groups[1]
285
+ currency = groups[2] if len(groups) >= 3 else 'rupees'
286
+
287
+ entities['numbers'].append({
288
+ 'value': float(min_number.replace(',', '')),
289
+ 'currency': currency,
290
+ 'comparator': 'between_min'
291
+ })
292
+ entities['numbers'].append({
293
+ 'value': float(max_number.replace(',', '')),
294
+ 'currency': currency,
295
+ 'comparator': 'between_max'
296
+ })
297
+ else:
298
+ # Pattern: (number) (currency)
299
+ number = groups[0]
300
+ currency = groups[1] if len(groups) >= 2 else 'rupees'
301
+
302
+ entities['numbers'].append({
303
+ 'value': float(number.replace(',', '')),
304
+ 'currency': currency,
305
+ 'comparator': None
306
+ })
307
+
308
+ # Extract comparators
309
+ comparator_patterns = [
310
+ r'(cheaper|more\s+expensive|better|worse|faster|slower)',
311
+ r'(compare|difference|vs|versus)',
312
+ r'(under|below|less\s+than|over|above|more\s+than)'
313
+ ]
314
+
315
+ for pattern in comparator_patterns:
316
+ matches = re.finditer(pattern, query, re.IGNORECASE)
317
+ for match in matches:
318
+ entities['comparators'].append(match.group(1).lower())
319
+
320
+ return entities
321
+
322
+ def _classify_intent(self, query: str, entities: Dict = None) -> Dict[str, Any]:
323
+ """Classify the intent of the query"""
324
+ intent_scores = {}
325
+
326
+ for intent, keywords in self.intent_keywords.items():
327
+ score = 0
328
+ for keyword in keywords:
329
+ if keyword in query:
330
+ score += 1
331
+ intent_scores[intent] = score
332
+
333
+ # Get primary intent
334
+ primary_intent = max(intent_scores.items(), key=lambda x: x[1])
335
+
336
+ # Check for specific patterns with priority
337
+ if any(pattern in query for pattern in ['compare', 'difference', 'vs', 'versus', 'cheaper', 'more expensive']):
338
+ primary_intent = ('comparison', 10)
339
+ elif any(pattern in query for pattern in ['recommend', 'suggest', 'best', 'optimal', 'popular']):
340
+ primary_intent = ('recommendation', 10)
341
+ elif any(pattern in query for pattern in ['between', 'under', 'over', 'above', 'below', 'range']):
342
+ primary_intent = ('range_search', 10)
343
+ elif any(pattern in query for pattern in ['fare', 'price', 'cost', 'how much']):
344
+ # Check if we have at least 2 locations
345
+ if entities and len(entities.get('locations', [])) >= 2:
346
+ primary_intent = ('fare_inquiry', 10)
347
+ elif any(pattern in query for pattern in ['route', 'bus', 'train', 'transport']):
348
+ primary_intent = ('route_inquiry', 10)
349
+
350
+ return {
351
+ 'primary': primary_intent[0],
352
+ 'confidence': primary_intent[1] / 10,
353
+ 'all_scores': intent_scores
354
+ }
355
+
356
+ def _generate_cypher_query(self, intent: Dict, entities: Dict, query: str) -> Optional[str]:
357
+ """Generate Cypher query using LLM for better understanding"""
358
+ try:
359
+ # Try LLM-based query generation first
360
+ llm_query = self._generate_cypher_with_llm(query, intent, entities)
361
+ if llm_query:
362
+ return llm_query
363
+ except Exception as e:
364
+ print(f"LLM query generation failed: {e}")
365
+
366
+ # Fallback to rule-based generation
367
+ primary_intent = intent['primary']
368
+
369
+ if primary_intent == 'fare_inquiry':
370
+ return self._generate_fare_query(entities)
371
+ elif primary_intent == 'comparison':
372
+ return self._generate_comparison_query(entities)
373
+ elif primary_intent == 'route_inquiry':
374
+ return self._generate_route_query(entities, query)
375
+ elif primary_intent == 'statistics':
376
+ return self._generate_statistics_query(entities)
377
+ elif primary_intent == 'recommendation':
378
+ return self._generate_recommendation_query(entities, query)
379
+ elif primary_intent == 'range_search':
380
+ return self._generate_range_query(entities)
381
+ else:
382
+ return self._generate_fallback_query(query)
383
+
384
+ def _generate_fare_query(self, entities: Dict) -> Optional[str]:
385
+ """Generate fare inquiry Cypher query"""
386
+ locations = entities.get('locations', [])
387
+
388
+ if len(locations) >= 2:
389
+ from_loc = locations[0]['corrected']
390
+ to_loc = locations[1]['corrected']
391
+
392
+ return f"""
393
+ MATCH (a:Place {{name: '{from_loc}'}})-[r:Fare]->(b:Place {{name: '{to_loc}'}})
394
+ RETURN
395
+ a.name as from_place,
396
+ b.name as to_place,
397
+ r.fare as fare,
398
+ 'Direct route' as route_type
399
+ """
400
+
401
+ return None
402
+
403
def _generate_comparison_query(self, entities: Dict) -> Optional[str]:
    """Build the Cypher query that compares the fares of two routes.

    With exactly three locations the first one is the shared origin of both
    routes; with four or more, the first two locations form route 1 and the
    next two form route 2.

    Args:
        entities: Extracted entities. ``entities['locations']`` is read as a
            list of dicts whose ``'corrected'`` entry holds the place name.

    Returns:
        A Cypher query returning both fares, their difference and a textual
        verdict, or ``None`` when fewer than three locations are available.
    """
    locations = entities.get('locations', [])
    if len(locations) < 3:
        return None

    def quote(name) -> str:
        # Names are embedded in single-quoted Cypher literals; escape
        # backslashes and apostrophes so they cannot terminate the literal.
        return str(name).replace('\\', '\\\\').replace("'", "\\'")

    if len(locations) == 3:
        # Pattern: "Colombo to Kandy and Colombo to Anuradhapura"
        route1_from = quote(locations[0]['corrected'])
        route1_to = quote(locations[1]['corrected'])
        route2_from = route1_from  # same origin for both routes
        route2_to = quote(locations[2]['corrected'])
    else:
        # Pattern: "Colombo to Kandy and Anuradhapura to Galle"
        route1_from = quote(locations[0]['corrected'])
        route1_to = quote(locations[1]['corrected'])
        route2_from = quote(locations[2]['corrected'])
        route2_to = quote(locations[3]['corrected'])

    return f"""
    MATCH (a1:Place {{name: '{route1_from}'}})-[r1:Fare]->(b1:Place {{name: '{route1_to}'}})
    MATCH (a2:Place {{name: '{route2_from}'}})-[r2:Fare]->(b2:Place {{name: '{route2_to}'}})
    RETURN
        a1.name + ' to ' + b1.name as route1,
        r1.fare as fare1,
        a2.name + ' to ' + b2.name as route2,
        r2.fare as fare2,
        r1.fare - r2.fare as difference,
        CASE
            WHEN r1.fare < r2.fare THEN 'Route 1 is cheaper'
            WHEN r1.fare > r2.fare THEN 'Route 2 is cheaper'
            ELSE 'Both routes have the same fare'
        END as comparison
    """
441
+
442
def _generate_route_query(self, entities: Dict, query: str) -> Optional[str]:
    """Build the Cypher query listing routes out of, or into, one place.

    The direction is taken from the wording of ``query``: the word "from"
    selects routes leaving the place, otherwise the word "to" selects routes
    arriving at it. Both are matched as whole words, case-insensitively, so
    that words such as "routes" or "station" no longer count as "to".

    Args:
        entities: Extracted entities. ``entities['locations']`` is read as a
            list of dicts whose ``'corrected'`` entry holds the place name;
            only the first location is used.
        query: The user's query text.

    Returns:
        A Cypher query ordered by fare, or ``None`` when no location or no
        direction word is present.
    """
    import re

    locations = entities.get('locations', [])
    if not locations:
        return None

    has_from = re.search(r'\bfrom\b', query, re.IGNORECASE) is not None
    has_to = re.search(r'\bto\b', query, re.IGNORECASE) is not None
    if not (has_from or has_to):
        return None

    # Escape backslashes and apostrophes: the name goes inside a
    # single-quoted Cypher literal.
    location = str(locations[0]['corrected']).replace('\\', '\\\\').replace("'", "\\'")

    if has_from:
        pattern = f"(a:Place {{name: '{location}'}})-[r:Fare]->(b:Place)"
    else:
        pattern = f"(a:Place)-[r:Fare]->(b:Place {{name: '{location}'}})"

    return f"""
    MATCH {pattern}
    RETURN
        a.name as from_place,
        b.name as to_place,
        r.fare as fare
    ORDER BY r.fare
    """
468
+
469
def _generate_statistics_query(self, entities: Dict) -> str:
    """Build the Cypher query that summarises places and fares.

    Places are counted first and carried forward with ``WITH`` before the
    ``Fare`` relationships are matched. Matching both patterns side by side
    would produce a Cartesian product, making ``count(r)`` return
    places x routes and skewing ``stDev``.

    Args:
        entities: Extracted entities (not used by this query).

    Returns:
        A Cypher query yielding ``total_places``, ``total_routes``,
        ``average_fare``, ``minimum_fare``, ``maximum_fare`` and
        ``fare_standard_deviation``.
    """
    # OPTIONAL MATCH keeps the single result row even when the graph has no
    # Fare relationships (counts become 0, fare aggregates become null).
    return """
    MATCH (p:Place)
    WITH count(p) AS total_places
    OPTIONAL MATCH ()-[r:Fare]->()
    RETURN
        total_places,
        count(r) as total_routes,
        round(avg(r.fare), 2) as average_fare,
        min(r.fare) as minimum_fare,
        max(r.fare) as maximum_fare,
        round(stDev(r.fare), 2) as fare_standard_deviation
    """
482
+
483
def _generate_recommendation_query(self, entities: Dict, query: str) -> str:
    """Build the Cypher query for route recommendations.

    The query wording selects the ordering and size of the result:

    * "cheap", "budget" or "affordable": ten cheapest routes;
    * "popular" or "frequent": ten routes ordered by fare descending;
    * anything else: five cheapest routes.

    Keywords are matched case-insensitively.

    Args:
        entities: Extracted entities (not used by this query).
        query: The user's query text.

    Returns:
        A Cypher query returning ``from_place``, ``to_place`` and ``fare``.
    """
    text = query.lower()

    if any(word in text for word in ('cheap', 'budget', 'affordable')):
        direction, limit = 'ASC', 10
    elif any(word in text for word in ('popular', 'frequent')):
        # NOTE(review): "popular" is approximated by highest fare first; no
        # popularity data is visible in the schema used here - confirm.
        direction, limit = 'DESC', 10
    else:
        direction, limit = 'ASC', 5

    return f"""
    MATCH (a:Place)-[r:Fare]->(b:Place)
    RETURN
        a.name as from_place,
        b.name as to_place,
        r.fare as fare
    ORDER BY r.fare {direction}
    LIMIT {limit}
    """
515
+
516
def _generate_range_query(self, entities: Dict) -> Optional[str]:
    """Build the Cypher query for a fare-range search.

    Each item of ``entities['numbers']`` is read as a dict with a ``'value'``
    and an optional ``'comparator'``. A complete ``between_min`` /
    ``between_max`` pair takes priority over a single-sided comparator.

    Values are validated as finite numbers before being placed in the query,
    so malformed values are ignored instead of being interpolated verbatim,
    and inverted "between" bounds are swapped.

    Args:
        entities: Extracted entities.

    Returns:
        A Cypher query filtering on ``r.fare``, or ``None`` when no usable
        range could be derived.
    """
    import math

    numbers = entities.get('numbers', [])
    if not numbers:
        return None

    def as_number(raw):
        """Return ``raw`` as a finite int/float, or ``None`` if it is not one."""
        if isinstance(raw, bool):
            return None
        if isinstance(raw, int):
            return raw
        try:
            number = float(raw)
        except (TypeError, ValueError):
            return None
        return number if math.isfinite(number) else None

    below_words = ('under', 'below', 'less than')
    above_words = ('over', 'above', 'more than')

    between_min = None
    between_max = None
    single_value = None
    single_comparator = None

    # Later entries override earlier ones, as in the original scan.
    for number in numbers:
        comparator = number.get('comparator', '')
        value = as_number(number['value'])
        if value is None:
            continue
        if comparator == 'between_min':
            between_min = value
        elif comparator == 'between_max':
            between_max = value
        elif comparator in below_words or comparator in above_words:
            single_value = value
            single_comparator = comparator

    if between_min is not None and between_max is not None:
        if between_min > between_max:
            # "between 500 and 100" - normalise instead of matching nothing.
            between_min, between_max = between_max, between_min
        condition = f"r.fare >= {between_min} AND r.fare <= {between_max}"
        direction = 'ASC'
    elif single_value is not None and single_comparator in below_words:
        condition = f"r.fare < {single_value}"
        direction = 'ASC'
    elif single_value is not None and single_comparator in above_words:
        condition = f"r.fare > {single_value}"
        direction = 'DESC'
    else:
        return None

    return f"""
    MATCH (a:Place)-[r:Fare]->(b:Place)
    WHERE {condition}
    RETURN
        a.name as from_place,
        b.name as to_place,
        r.fare as fare
    ORDER BY r.fare {direction}
    """
573
+
574
def _generate_cypher_with_llm(self, query: str, intent: Dict, entities: Dict) -> Optional[str]:
    """Ask the configured OpenAI model to write a Cypher query for ``query``.

    The model's answer is treated as untrusted text. It is accepted only if,
    after removing an optional markdown code fence, it starts with ``MATCH``
    and contains no write or procedure clause outside string literals. The
    answer is later executed as-is, so a reply such as
    ``MATCH (n) DETACH DELETE n`` must never be returned.

    Args:
        query: The user's query text.
        intent: Detected intent; ``intent.get('primary')`` is shown to the model.
        entities: Extracted entities, shown to the model verbatim.

    Returns:
        The validated Cypher query, or ``None`` when no API key is configured,
        the model answers ``FALLBACK``, the answer fails validation, or any
        error occurs (the error is printed).
    """
    import re

    try:
        if not self.config.OPENAI_API_KEY:
            return None

        # Get available places for context. str() guards against non-string
        # entries, which would make ', '.join raise.
        available_places = [str(place) for place in self.neo4j_service.get_all_places()]

        # Create comprehensive prompt for Cypher generation
        prompt = f"""
        You are a Neo4j Cypher query generator for a transport database.

        Database Schema:
        - Nodes: Place (with property 'name')
        - Relationships: Fare (with property 'fare')

        Available Places: {', '.join(available_places[:50])}... (total: {len(available_places)})

        User Query: "{query}"
        Detected Intent: {intent.get('primary', 'unknown')}
        Extracted Entities: {entities}

        Your task is to generate a valid Cypher query that answers the user's question.

        Query Types and Examples:

        1. FARE INQUIRY:
        - "What is the fare from Colombo to Kandy?"
        - Cypher: MATCH (a:Place {{name: 'Colombo'}})-[r:Fare]->(b:Place {{name: 'Kandy'}}) RETURN a.name as from_place, b.name as to_place, r.fare as fare

        2. COMPARISON:
        - "Compare fares from Colombo to Kandy vs Colombo to Galle"
        - Cypher: MATCH (a1:Place {{name: 'Colombo'}})-[r1:Fare]->(b1:Place {{name: 'Kandy'}}) MATCH (a2:Place {{name: 'Colombo'}})-[r2:Fare]->(b2:Place {{name: 'Galle'}}) RETURN a1.name + ' to ' + b1.name as route1, r1.fare as fare1, a2.name + ' to ' + b2.name as route2, r2.fare as fare2, r1.fare - r2.fare as difference

        3. RANGE SEARCH:
        - "Find routes under 500 rupees"
        - Cypher: MATCH (a:Place)-[r:Fare]->(b:Place) WHERE r.fare < 500 RETURN a.name as from_place, b.name as to_place, r.fare as fare ORDER BY r.fare ASC

        4. RECOMMENDATION:
        - "Recommend cheap routes"
        - Cypher: MATCH (a:Place)-[r:Fare]->(b:Place) RETURN a.name as from_place, b.name as to_place, r.fare as fare ORDER BY r.fare ASC LIMIT 10

        5. STATISTICS:
        - "What is the average fare?"
        - Cypher: MATCH ()-[r:Fare]->() RETURN round(avg(r.fare), 2) as average_fare, min(r.fare) as min_fare, max(r.fare) as max_fare

        6. ROUTE INQUIRY:
        - "Routes from Colombo"
        - Cypher: MATCH (a:Place {{name: 'Colombo'}})-[r:Fare]->(b:Place) RETURN a.name as from_place, b.name as to_place, r.fare as fare ORDER BY r.fare

        Important Rules:
        1. Always use proper Cypher syntax
        2. Use exact place names from the available places list
        3. For comparisons, use multiple MATCH clauses
        4. For ranges, use WHERE clauses with appropriate operators
        5. For statistics, use aggregation functions
        6. Always include meaningful column aliases
        7. Use ORDER BY for sorted results
        8. Use LIMIT for large result sets

        Return ONLY the Cypher query, nothing else. If you cannot generate a valid query, return "FALLBACK".
        """

        request = {
            'model': self.config.OPENAI_MODEL,
            'messages': [
                {"role": "system", "content": "You are a Cypher query generator. Return only valid Cypher queries."},
                {"role": "user", "content": prompt},
            ],
            'max_tokens': 300,
            'temperature': 0.1,
        }

        # Prefer the new SDK (openai>=1.0). Fall back to the legacy module-level
        # API only when the new client class does not exist; on a new SDK the
        # legacy call cannot succeed, so retrying after an API error is pointless.
        try:
            from openai import OpenAI
        except ImportError:
            import openai
            openai.api_key = self.config.OPENAI_API_KEY
            response = openai.ChatCompletion.create(**request)
        else:
            client = OpenAI(api_key=self.config.OPENAI_API_KEY)
            response = client.chat.completions.create(**request)

        # content can be None (e.g. a filtered reply); treat it as empty.
        cypher_query = (response.choices[0].message.content or '').strip()

        # Models often wrap code in a markdown fence; unwrap it rather than
        # discarding an otherwise valid answer.
        fenced = re.match(r'^```[a-zA-Z]*\s*(.*?)\s*```$', cypher_query, re.DOTALL)
        if fenced:
            cypher_query = fenced.group(1).strip()

        # Validate the response
        if not cypher_query or cypher_query.upper() == "FALLBACK":
            return None

        # Basic validation - check if it starts with MATCH
        if not cypher_query.upper().startswith('MATCH'):
            return None

        # Reject anything that could modify the graph or call procedures.
        # String literals are blanked first so place names such as 'Set'
        # do not trigger a false rejection.
        without_literals = re.sub(
            r"'(?:\\.|[^'\\])*'|\"(?:\\.|[^\"\\])*\"", "''", cypher_query
        )
        forbidden = re.search(
            r'\b(?:CREATE|MERGE|DELETE|DETACH|SET|REMOVE|DROP|FOREACH|CALL|LOAD\s+CSV)\b',
            without_literals,
            re.IGNORECASE,
        )
        if forbidden:
            print(f"LLM Cypher generation error: rejected non read-only clause '{forbidden.group(0)}'")
            return None

        return cypher_query

    except Exception as e:
        print(f"LLM Cypher generation error: {e}")
        return None
683
+
684
def _generate_fallback_query(self, query: str) -> Optional[str]:
    """Build a direct fare query when the intent could not be classified.

    Two strategies are tried in order:

    1. Use the first two locations returned by
       ``self.spell_corrector.extract_locations_from_query`` (element ``[1]``
       of each entry is used as the place name).
    2. If the query mentions "fare", "price" or "cost", look for a
       "<place> to <place>" phrase with regular expressions and spell-correct
       both names; the pair is used only when both corrections have a
       confidence above 0.5.

    Args:
        query: The user's query text.

    Returns:
        A Cypher fare query, or ``None`` when no pair of places was found.
    """
    import re

    def quote(name) -> str:
        # Names are embedded in single-quoted Cypher literals; escape
        # backslashes and apostrophes so they cannot terminate the literal.
        return str(name).replace('\\', '\\\\').replace("'", "\\'")

    # Try to extract locations using spell corrector
    locations = self.spell_corrector.extract_locations_from_query(query)

    if len(locations) >= 2:
        from_loc = quote(locations[0][1])
        to_loc = quote(locations[1][1])
        return f"""
        MATCH (a:Place {{name: '{from_loc}'}})-[r:Fare]->(b:Place {{name: '{to_loc}'}})
        RETURN
            a.name as from_place,
            b.name as to_place,
            r.fare as fare
        """

    # Additional fallback: direct pattern matching for fare queries
    lowered = query.lower()
    if not any(word in lowered for word in ('fare', 'price', 'cost')):
        return None

    fare_patterns = [
        r'fare\s+(?:of|from)?\s+([a-zA-Z\s]+)\s+to\s+([a-zA-Z\s]+)',
        r'price\s+(?:of|from)?\s+([a-zA-Z\s]+)\s+to\s+([a-zA-Z\s]+)',
        r'cost\s+(?:of|from)?\s+([a-zA-Z\s]+)\s+to\s+([a-zA-Z\s]+)',
        r'(?:what\s+is\s+)?(?:the\s+)?(?:fare|price|cost)(?:\s+of)?(?:\s+from)?\s+([a-zA-Z\s]+)\s+to\s+([a-zA-Z\s]+)',
        r'(?:how\s+much\s+)?(?:is|does)\s+(?:the\s+)?(?:fare|price|cost)\s+(?:from\s+)?([a-zA-Z\s]+)\s+to\s+([a-zA-Z\s]+)'
    ]

    for pattern in fare_patterns:
        match = re.search(pattern, lowered)
        if not match:
            continue

        # Correct locations
        from_corrected, from_conf, _ = self.spell_corrector.correct_location(match.group(1).strip())
        to_corrected, to_conf, _ = self.spell_corrector.correct_location(match.group(2).strip())

        if from_conf > 0.5 and to_conf > 0.5:
            return f"""
            MATCH (a:Place {{name: '{quote(from_corrected)}'}})-[r:Fare]->(b:Place {{name: '{quote(to_corrected)}'}})
            RETURN a.name as from_place, b.name as to_place, r.fare as fare
            """

    return None
728
+
729
def _execute_query(self, cypher_query: str) -> List[Dict]:
    """Run ``cypher_query`` through the Neo4j driver and collect the rows.

    Returns:
        One dict per returned record. Any failure is printed and reported
        as an empty list.
    """
    try:
        with self.neo4j_service.driver.session() as db_session:
            rows = []
            for record in db_session.run(cypher_query):
                rows.append(dict(record))
            return rows
    except Exception as e:
        print(f"Query execution error: {e}")
        return []
738
+
739
def _format_response(self, intent: Dict, entities: Dict, results: List[Dict], query: str) -> Dict[str, Any]:
    """Turn raw query results into the response dict for the detected intent.

    An empty result set yields a failure response with suggestions; otherwise
    the formatter matching ``intent['primary']`` is used, falling back to the
    generic formatter for unknown intents.
    """
    primary_intent = intent['primary']

    if not results:
        return {
            'success': False,
            'message': 'No results found for your query.',
            'suggestions': self._get_suggestions()
        }

    # Intent name -> deferred formatter call; checked in declaration order.
    formatters = (
        ('fare_inquiry', lambda: self._format_fare_response(results, entities)),
        ('comparison', lambda: self._format_comparison_response(results, entities)),
        ('route_inquiry', lambda: self._format_route_response(results, entities)),
        ('statistics', lambda: self._format_statistics_response(results)),
        ('recommendation', lambda: self._format_recommendation_response(results, query)),
        ('range_search', lambda: self._format_range_response(results, entities)),
    )
    for intent_name, build in formatters:
        if primary_intent == intent_name:
            return build()

    return self._format_generic_response(results)
764
+
765
def _format_fare_response(self, results: List[Dict], entities: Dict) -> Dict[str, Any]:
    """Describe the first fare result as a user-facing response dict."""
    if not results:
        return {'success': False, 'message': 'Fare information not found.'}

    top = results[0]
    message = f"The fare from {top['from_place']} to {top['to_place']} is Rs. {top['fare']}"
    summary = {
        'from_place': top['from_place'],
        'to_place': top['to_place'],
        'fare': top['fare']
    }
    return {
        'success': True,
        'message': message,
        'data': results,
        'query_type': 'fare_inquiry',
        'summary': summary
    }
781
+
782
def _format_comparison_response(self, results: List[Dict], entities: Dict) -> Dict[str, Any]:
    """Describe the first comparison result as a user-facing response dict."""
    if not results:
        return {'success': False, 'message': 'Comparison not possible.'}

    top = results[0]
    verdict = top.get('comparison', 'Comparison completed')
    summary = {field: top.get(field) for field in ('route1', 'route2', 'difference')}
    return {
        'success': True,
        'message': verdict,
        'data': results,
        'query_type': 'comparison',
        'summary': summary
    }
798
+
799
def _format_route_response(self, results: List[Dict], entities: Dict) -> Dict[str, Any]:
    """Summarise a list of routes (count and fare span) as a response dict."""
    route_count = len(results)
    if results:
        fares = [row['fare'] for row in results]
        fare_range = f"Rs. {min(fares)} - Rs. {max(fares)}"
    else:
        fare_range = "N/A"

    return {
        'success': True,
        'message': f"Found {route_count} routes",
        'data': results,
        'query_type': 'route_inquiry',
        'summary': {
            'total_routes': route_count,
            'fare_range': fare_range
        }
    }
811
+
812
def _format_statistics_response(self, results: List[Dict]) -> Dict[str, Any]:
    """Describe the first statistics row as a user-facing response dict."""
    if not results:
        return {'success': False, 'message': 'Statistics not available.'}

    stats = results[0]
    message = f"Database contains {stats['total_places']} places and {stats['total_routes']} routes"
    summary = {
        'total_places': stats['total_places'],
        'total_routes': stats['total_routes'],
        'average_fare': stats['average_fare'],
        'fare_range': f"Rs. {stats['minimum_fare']} - Rs. {stats['maximum_fare']}"
    }
    return {
        'success': True,
        'message': message,
        'data': results,
        'query_type': 'statistics',
        'summary': summary
    }
829
+
830
def _format_recommendation_response(self, results: List[Dict], query: str) -> Dict[str, Any]:
    """Summarise recommended routes (count and fare span) as a response dict."""
    total = len(results)
    if results:
        fares = [row['fare'] for row in results]
        fare_range = f"Rs. {min(fares)} - Rs. {max(fares)}"
    else:
        fare_range = "N/A"

    return {
        'success': True,
        'message': f"Here are {total} recommended routes",
        'data': results,
        'query_type': 'recommendation',
        'summary': {
            'recommendations_count': total,
            'fare_range': fare_range
        }
    }
842
+
843
def _format_range_response(self, results: List[Dict], entities: Dict) -> Dict[str, Any]:
    """Summarise routes found by a range search as a response dict."""
    found = len(results)
    if results:
        fares = [row['fare'] for row in results]
        fare_range = f"Rs. {min(fares)} - Rs. {max(fares)}"
    else:
        fare_range = "N/A"

    return {
        'success': True,
        'message': f"Found {found} routes in your specified range",
        'data': results,
        'query_type': 'range_search',
        'summary': {
            'routes_found': found,
            'fare_range': fare_range
        }
    }
855
+
856
def _format_generic_response(self, results: List[Dict]) -> Dict[str, Any]:
    """Wrap results of an unclassified query in a generic response dict."""
    response: Dict[str, Any] = {'success': True}
    response['message'] = f"Found {len(results)} results"
    response['data'] = results
    response['query_type'] = 'generic'
    return response
864
+
865
def _handle_unclear_query(self, query: str) -> Dict[str, Any]:
    """Return the failure response used when a query cannot be interpreted."""
    response: Dict[str, Any] = {
        'success': False,
        'message': 'I could not understand your query. Please try rephrasing it.',
    }
    response['suggestions'] = self._get_suggestions()
    response['query_type'] = 'unclear'
    return response
873
+
874
def _calculate_confidence(self, intent: Dict, entities: Dict) -> float:
    """Score how confidently the query was interpreted, capped at 1.0.

    The score adds 40% of the intent confidence, 40% of the mean location
    confidence (when locations exist) and a flat 0.2 when at least two
    locations were found.
    """
    score = 0.0

    # Intent contribution
    score += intent.get('confidence', 0) * 0.4

    # Location contribution
    locations = entities.get('locations', [])
    if locations:
        location_scores = [loc['confidence'] for loc in locations]
        mean_location_score = sum(location_scores) / len(locations)
        score += mean_location_score * 0.4

    # Bonus for queries that name at least two places
    if len(locations) >= 2:
        score += 0.2

    return min(score, 1.0)
892
+
893
def _get_suggestions(self) -> List[str]:
    """Return example queries shown to the user when a query fails."""
    examples = (
        "What is the fare from Colombo to Kandy?",
        "Compare fares from Colombo to Kandy vs Colombo to Galle",
        "Show me routes from Panadura",
        "Find routes under 500 rupees",
        "What are the cheapest routes?",
        "Show me popular destinations",
        "Give me database statistics",
        "Recommend affordable routes",
    )
    # A fresh list per call, so callers may mutate the result safely.
    return list(examples)
language_detector.py ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Language Detection Service
4
+ Auto-detects user input language: Sinhala, Singlish, English, Tamil
5
+ """
6
+
7
+ import re
8
+ from typing import Dict, Any, Optional, Tuple
9
+ from logger import get_logger
10
+
11
class LanguageDetector:
    """Detects language of user input with support for Sinhala, Singlish, English, and Tamil.

    Detection combines three signals: the share of characters belonging to
    each script, regular-expression patterns for Sinhala and Latin text
    appearing together, and lists of indicator words. The possible labels are
    ``'sinhala'``, ``'tamil'``, ``'english'``, ``'singlish'``,
    ``'tamil_english'`` and ``'unknown'`` (empty input).
    """

    def __init__(self):
        self.logger = get_logger(self.__class__.__name__)

        # Unicode ranges for different scripts
        self.script_ranges = {
            'sinhala': re.compile(r'[\u0D80-\u0DFF]'),  # Sinhala script
            'tamil': re.compile(r'[\u0B80-\u0BFF]'),  # Tamil script
            'english': re.compile(r'[a-zA-Z]'),  # Latin script
            'numbers': re.compile(r'[0-9]'),  # Numbers
            'punctuation': re.compile(r'[^\w\s]')  # Punctuation
        }

        # Common Singlish patterns (Sinhala + English mixed)
        self.singlish_patterns = [
            r'[\u0D80-\u0DFF]+[a-zA-Z]+',  # Sinhala followed by English
            r'[a-zA-Z]+[\u0D80-\u0DFF]+',  # English followed by Sinhala
            r'[\u0D80-\u0DFF]+\s+[a-zA-Z]+',  # Sinhala word followed by English word
            r'[a-zA-Z]+\s+[\u0D80-\u0DFF]+',  # English word followed by Sinhala word
        ]

        # Common Singlish words/phrases
        self.singlish_indicators = [
            'bus', 'fare', 'price', 'cost', 'route', 'ticket', 'station',
            'colombo', 'kandy', 'galle', 'matara', 'anuradhapura', 'panadura',
            'rupees', 'rs', 'lkr', 'how much', 'what is', 'show me', 'find',
            'from', 'to', 'and', 'or', 'the', 'a', 'an', 'is', 'are', 'was', 'were'
        ]

        # Tamil transport terms (for detection)
        self.tamil_transport_terms = [
            'பேருந்து', 'கட்டணம்', 'விலை', 'செலவு', 'பாதை', 'டிக்கெட்', 'நிலையம்',
            'கொழும்பு', 'கண்டி', 'காலி', 'மாத்தறை', 'அனுராதபுரம்', 'பனதுரை',
            'ரூபாய்', 'எவ்வளவு', 'என்ன', 'காட்டு', 'கண்டுபிடி', 'இருந்து', 'வரை',
            'மற்றும்', 'அல்லது', 'இது', 'அது', 'உள்ளது', 'இருக்கிறது'
        ]

        # Sinhala transport terms (for detection)
        self.sinhala_transport_terms = [
            'බස්', 'ගාස්තු', 'මිල', 'වාරික', 'මාර්ග', 'ටිකට්', 'නිලය',
            'කොළඹ', 'මහනුවර', 'ගාල්ල', 'මාතර', 'අනුරාධපුර', 'පානදුර',
            'රුපියල්', 'කීයද', 'මොනවාද', 'පෙන්වන්න', 'සොයන්න', 'සිට', 'ට',
            'සහ', 'හෝ', 'මේ', 'ඒ', 'කියලා', 'ඉන්නවා'
        ]

    def detect_language(self, text: str) -> Dict[str, Any]:
        """
        Detect the language of the input text

        Args:
            text: Input text to analyze

        Returns:
            Dictionary with ``'language'``, ``'confidence'`` and ``'details'``.
            ``details`` always carries ``script_analysis``, ``singlish_matches``,
            ``reasoning``, ``original_text`` and ``clean_text``.
        """
        if not text or not text.strip():
            return {
                'language': 'unknown',
                'confidence': 0.0,
                'details': {
                    'script_analysis': {},
                    # Kept for callers that read the original empty-input key.
                    'pattern_matches': [],
                    # Same keys as the non-empty result, so callers can read
                    # ``details`` without special-casing empty input.
                    'singlish_matches': {},
                    'reasoning': 'Empty or whitespace-only text',
                    'original_text': text,
                    'clean_text': ''
                }
            }

        # Clean and normalize text
        clean_text = text.strip()

        # Analyze script composition
        script_analysis = self._analyze_scripts(clean_text)

        # Check for Singlish patterns
        singlish_matches = self._detect_singlish(clean_text)

        # Determine primary language
        language, confidence, reasoning = self._determine_language(script_analysis, singlish_matches, clean_text)

        return {
            'language': language,
            'confidence': confidence,
            'details': {
                'script_analysis': script_analysis,
                'singlish_matches': singlish_matches,
                'reasoning': reasoning,
                'original_text': text,
                'clean_text': clean_text
            }
        }

    def _analyze_scripts(self, text: str) -> Dict[str, Any]:
        """Count characters per script and derive each script's share of the text.

        Ratios are relative to ``len(text)``, i.e. whitespace, digits and
        punctuation count towards the total.
        """
        analysis = {
            'sinhala_chars': 0,
            'tamil_chars': 0,
            'english_chars': 0,
            'number_chars': 0,
            'punctuation_chars': 0,
            'total_chars': len(text),
            'sinhala_ratio': 0.0,
            'tamil_ratio': 0.0,
            'english_ratio': 0.0,
            'mixed_script': False
        }

        # Classes are tested in this order on purpose: Sinhala/Tamil
        # combining marks are not ``\w`` and would otherwise be counted as
        # punctuation.
        counters = (
            ('sinhala', 'sinhala_chars'),
            ('tamil', 'tamil_chars'),
            ('english', 'english_chars'),
            ('numbers', 'number_chars'),
            ('punctuation', 'punctuation_chars'),
        )
        for char in text:
            for script, counter in counters:
                if self.script_ranges[script].match(char):
                    analysis[counter] += 1
                    break

        # Calculate ratios
        total = analysis['total_chars']
        if total > 0:
            analysis['sinhala_ratio'] = analysis['sinhala_chars'] / total
            analysis['tamil_ratio'] = analysis['tamil_chars'] / total
            analysis['english_ratio'] = analysis['english_chars'] / total

        # Check for mixed script
        script_count = sum([
            analysis['sinhala_chars'] > 0,
            analysis['tamil_chars'] > 0,
            analysis['english_chars'] > 0
        ])
        analysis['mixed_script'] = script_count > 1

        return analysis

    def _detect_singlish(self, text: str) -> Dict[str, Any]:
        """Detect Singlish patterns in the text.

        Returns a dict with the regex ``pattern_matches``, the
        ``indicator_words`` found, an ``is_singlish`` flag and a
        ``confidence`` capped at 0.9.
        """
        matches = {
            'pattern_matches': [],
            'indicator_words': [],
            'is_singlish': False,
            'confidence': 0.0
        }

        # Check for Singlish patterns
        for pattern in self.singlish_patterns:
            matches['pattern_matches'].extend(re.findall(pattern, text))

        # Check for Singlish indicator words
        # NOTE(review): this is a substring test, so short indicators such as
        # 'a' or 'to' match inside almost any Latin-script text and plain
        # English queries are usually reported as Singlish. Left unchanged
        # because romanised-Sinhala detection may rely on it - confirm the
        # intended behaviour before tightening to whole-word matching.
        text_lower = text.lower()
        matches['indicator_words'] = [
            indicator for indicator in self.singlish_indicators if indicator in text_lower
        ]

        # Determine if it's Singlish
        if matches['pattern_matches'] or len(matches['indicator_words']) >= 2:
            matches['is_singlish'] = True
            matches['confidence'] = min(
                0.9,
                0.3 + (len(matches['pattern_matches']) * 0.2) + (len(matches['indicator_words']) * 0.1)
            )

        return matches

    def _determine_language(self, script_analysis: Dict, singlish_matches: Dict, text: str) -> Tuple[str, float, str]:
        """Determine the primary language based on analysis.

        Returns:
            ``(language, confidence, reasoning)``.
        """
        sinhala_ratio = script_analysis['sinhala_ratio']
        tamil_ratio = script_analysis['tamil_ratio']
        english_ratio = script_analysis['english_ratio']

        # High confidence cases
        if singlish_matches['is_singlish'] and singlish_matches['confidence'] > 0.6:
            return 'singlish', singlish_matches['confidence'], 'Detected Singlish patterns and indicator words'

        # Pure script cases
        if sinhala_ratio > 0.7 and tamil_ratio == 0:
            return 'sinhala', sinhala_ratio, 'High Sinhala script ratio'

        if tamil_ratio > 0.7 and sinhala_ratio == 0:
            return 'tamil', tamil_ratio, 'High Tamil script ratio'

        if english_ratio > 0.7 and sinhala_ratio == 0 and tamil_ratio == 0:
            return 'english', english_ratio, 'High English script ratio'

        # Mixed cases with dominant script
        if sinhala_ratio > 0.4:
            if english_ratio > 0.2:
                return 'singlish', sinhala_ratio, 'Sinhala-dominant mixed text with English'
            return 'sinhala', sinhala_ratio, 'Sinhala-dominant text'

        if tamil_ratio > 0.4:
            if english_ratio > 0.2:
                return 'tamil_english', tamil_ratio, 'Tamil-dominant mixed text with English'
            return 'tamil', tamil_ratio, 'Tamil-dominant text'

        if english_ratio > 0.4:
            if sinhala_ratio > 0.1 or tamil_ratio > 0.1:
                # Label the mix by the script that is actually present:
                # English mixed with Tamil is not Singlish (Sinhala + English).
                if tamil_ratio > sinhala_ratio:
                    return 'tamil_english', english_ratio, 'English-dominant mixed text'
                return 'singlish', english_ratio, 'English-dominant mixed text'
            return 'english', english_ratio, 'English-dominant text'

        # Fallback: check for specific terms
        text_lower = text.lower()
        sinhala_terms_found = sum(1 for term in self.sinhala_transport_terms if term in text_lower)
        tamil_terms_found = sum(1 for term in self.tamil_transport_terms if term in text_lower)

        if sinhala_terms_found > tamil_terms_found and sinhala_terms_found > 0:
            return 'sinhala', 0.6, f'Found {sinhala_terms_found} Sinhala transport terms'

        if tamil_terms_found > sinhala_terms_found and tamil_terms_found > 0:
            return 'tamil', 0.6, f'Found {tamil_terms_found} Tamil transport terms'

        # Default to English if no clear indicators. (Every text with an
        # English ratio above 0.4 has already been handled above.)
        return 'english', 0.5, 'Default to English - no clear language indicators'

    def is_sinhala(self, text: str) -> bool:
        """Quick check if text is Sinhala (Singlish counts as Sinhala here)"""
        result = self.detect_language(text)
        return result['language'] in ['sinhala', 'singlish']

    def is_tamil(self, text: str) -> bool:
        """Quick check if text is Tamil (Tamil mixed with English counts too)"""
        result = self.detect_language(text)
        return result['language'] in ['tamil', 'tamil_english']

    def is_english(self, text: str) -> bool:
        """Quick check if text is English"""
        result = self.detect_language(text)
        return result['language'] == 'english'

    def is_singlish(self, text: str) -> bool:
        """Quick check if text is Singlish"""
        result = self.detect_language(text)
        return result['language'] == 'singlish'

    def get_detection_summary(self, text: str) -> str:
        """Get a human-readable summary of language detection"""
        result = self.detect_language(text)
        return f"Language: {result['language']} (confidence: {result['confidence']:.2f}) - {result['details']['reasoning']}"
llm_query_processor.py CHANGED
@@ -1,351 +1,384 @@
1
- #!/usr/bin/env python3
2
- """
3
- LLM-Based Query Processor for Transport Query Application
4
- Uses AI to interpret queries and generate Cypher queries
5
- """
6
-
7
- import re
8
- from typing import Dict, List, Tuple, Optional
9
- from spell_corrector import SpellCorrector
10
- from neo4j_service import Neo4jService
11
- from config import Config
12
-
13
- class LLMQueryProcessor:
14
- """Process natural language queries using LLM for interpretation and Cypher generation"""
15
-
16
- def __init__(self):
17
- self.config = Config()
18
- self.spell_corrector = SpellCorrector()
19
- self.neo4j_service = Neo4jService()
20
-
21
- def process_query(self, user_query: str) -> Dict:
22
- """
23
- Process a natural language query using LLM for interpretation
24
-
25
- Returns:
26
- Dictionary with query results and metadata
27
- """
28
- try:
29
- # First, extract and correct locations from the query
30
- locations = self.spell_corrector.extract_locations_from_query(user_query)
31
-
32
- # Use LLM to interpret the query and generate Cypher
33
- interpretation = self._interpret_query_with_llm(user_query, locations)
34
-
35
- if interpretation['success']:
36
- # Execute the generated Cypher query
37
- result = self._execute_cypher_query(interpretation['cypher_query'])
38
-
39
- return {
40
- 'success': True,
41
- 'message': interpretation['message'],
42
- 'cypher_query': interpretation['cypher_query'],
43
- 'data': result,
44
- 'corrections': self._format_corrections(locations),
45
- 'query_type': interpretation['query_type']
46
- }
47
- else:
48
- return {
49
- 'success': False,
50
- 'message': interpretation['message'],
51
- 'suggestions': self._get_query_suggestions()
52
- }
53
-
54
- except Exception as e:
55
- print(f"Query processing error: {e}")
56
- return {
57
- 'success': False,
58
- 'message': 'An error occurred while processing your query.',
59
- 'suggestions': self._get_query_suggestions()
60
- }
61
-
62
- def _interpret_query_with_llm(self, query: str, locations: List[Tuple]) -> Dict:
63
- """Use LLM to interpret the query and generate appropriate Cypher"""
64
- try:
65
- if not self.config.OPENAI_API_KEY:
66
- return self._fallback_interpretation(query, locations)
67
-
68
- # Get available places for context
69
- available_places = list(self.neo4j_service.get_all_places())
70
-
71
- # Create comprehensive prompt for query interpretation
72
- prompt = f"""
73
- You are an intelligent transport query interpreter for a Neo4j database containing Sri Lankan transport data.
74
-
75
- Database Schema:
76
- - Nodes: Place (with property 'name')
77
- - Relationships: Fare (with property 'fare')
78
-
79
- Available Places: {', '.join(available_places[:50])}... (total: {len(available_places)})
80
-
81
- User Query: "{query}"
82
-
83
- Extracted Locations: {[f"{orig}->{corr}" for orig, corr, conf, method in locations]}
84
-
85
- Your task is to:
86
- 1. Determine the query type (fare, cheapest, expensive, places, routes_from, routes_to, statistics, lowest_fare)
87
- 2. Generate the appropriate Cypher query
88
- 3. Provide a clear response message
89
-
90
- Query Types:
91
- - fare: Find fare between two specific locations
92
- - cheapest: Find cheapest routes (top 10)
93
- - expensive: Find most expensive routes (top 10)
94
- - places: List all places
95
- - routes_from: Find routes departing from a location
96
- - routes_to: Find routes arriving at a location
97
- - statistics: Get database statistics
98
- - lowest_fare: Find the single lowest fare with route details
99
-
100
- Return your response in this exact JSON format:
101
- {{
102
- "query_type": "fare|cheapest|expensive|places|routes_from|routes_to|statistics|lowest_fare",
103
- "cypher_query": "MATCH ... RETURN ...",
104
- "message": "Clear response message for the user"
105
- }}
106
-
107
- Examples:
108
- - "What is the fare from Colombo to Kandy?" → fare query: MATCH (a:Place {name: 'Colombo'})-[r:Fare]->(b:Place {name: 'Kandy'}) RETURN a.name as from_place, b.name as to_place, r.fare as fare
109
- - "fare of anuradhapura to kandy?" → fare query: MATCH (a:Place {name: 'Anuradapura'})-[r:Fare]->(b:Place {name: 'Kandy'}) RETURN a.name as from_place, b.name as to_place, r.fare as fare
110
- - "Show me the cheapest routes" cheapest query: MATCH (a:Place)-[r:Fare]->(b:Place) RETURN a.name as from_place, b.name as to_place, r.fare as fare ORDER BY r.fare ASC LIMIT 10
111
- - "What is the lowest fare?" → lowest_fare query: MATCH (a:Place)-[r:Fare]->(b:Place) RETURN a.name as from_place, b.name as to_place, r.fare as fare ORDER BY r.fare ASC LIMIT 1
112
- - "List all places" → places query: MATCH (p:Place) RETURN DISTINCT p.name as place ORDER BY p.name
113
- - "Routes from Colombo" → routes_from query: MATCH (a:Place {name: 'Colombo'})-[r:Fare]->(b:Place) RETURN a.name as from_place, b.name as to_place, r.fare as fare ORDER BY r.fare
114
- - "Database statistics" → statistics query: MATCH (p:Place) MATCH ()-[r:Fare]->() RETURN count(DISTINCT p) as total_places, count(r) as total_routes, avg(r.fare) as average_fare, min(r.fare) as min_fare, max(r.fare) as max_fare
115
-
116
- Keep Cypher queries simple and avoid complex functions like shortestPath. Use direct relationships only.
117
-
118
- For fare queries, recognize various formats like "fare of X to Y", "fare from X to Y", "price from X to Y", etc.
119
- """
120
-
121
- # Call LLM using new SDK first, legacy as fallback
122
- import json
123
- interpretation = None
124
- try:
125
- from openai import OpenAI
126
- client = OpenAI(api_key=self.config.OPENAI_API_KEY)
127
- response = client.chat.completions.create(
128
- model=self.config.OPENAI_MODEL,
129
- messages=[
130
- {"role": "system", "content": "You are a transport query interpreter. Return only valid JSON."},
131
- {"role": "user", "content": prompt}
132
- ],
133
- max_tokens=500,
134
- temperature=0.1
135
- )
136
- interpretation = json.loads(response.choices[0].message.content.strip())
137
- except Exception as sdk_err:
138
- try:
139
- import openai
140
- openai.api_key = self.config.OPENAI_API_KEY
141
- response = openai.ChatCompletion.create(
142
- model=self.config.OPENAI_MODEL,
143
- messages=[
144
- {"role": "system", "content": "You are a transport query interpreter. Return only valid JSON."},
145
- {"role": "user", "content": prompt}
146
- ],
147
- max_tokens=500,
148
- temperature=0.1
149
- )
150
- interpretation = json.loads(response.choices[0].message.content.strip())
151
- except Exception:
152
- raise sdk_err
153
-
154
- # Validate the response
155
- if interpretation and 'query_type' in interpretation and 'cypher_query' in interpretation and 'message' in interpretation:
156
- return {
157
- 'success': True,
158
- 'query_type': interpretation['query_type'],
159
- 'cypher_query': interpretation['cypher_query'],
160
- 'message': interpretation['message']
161
- }
162
- else:
163
- return self._fallback_interpretation(query, locations)
164
-
165
- except Exception as e:
166
- print(f"LLM interpretation error: {e}")
167
- return self._fallback_interpretation(query, locations)
168
-
169
- def _fallback_interpretation(self, query: str, locations: List[Tuple]) -> Dict:
170
- """Fallback interpretation when LLM is not available"""
171
- query_lower = query.lower()
172
-
173
- # Simple keyword-based interpretation
174
- if 'lowest' in query_lower or 'minimum' in query_lower or 'cheapest' in query_lower:
175
- if 'lowest fare' in query_lower or 'minimum fare' in query_lower:
176
- return {
177
- 'success': True,
178
- 'query_type': 'lowest_fare',
179
- 'cypher_query': """
180
- MATCH (a:Place)-[r:Fare]->(b:Place)
181
- WITH a, b, r, r.fare as fare
182
- ORDER BY r.fare ASC
183
- LIMIT 1
184
- RETURN a.name as from_place, b.name as to_place, fare
185
- """,
186
- 'message': 'Finding the lowest fare in the database...'
187
- }
188
- else:
189
- return {
190
- 'success': True,
191
- 'query_type': 'cheapest',
192
- 'cypher_query': """
193
- MATCH (a:Place)-[r:Fare]->(b:Place)
194
- RETURN a.name as from_place, b.name as to_place, r.fare as fare
195
- ORDER BY r.fare ASC
196
- LIMIT 10
197
- """,
198
- 'message': 'Finding the cheapest routes...'
199
- }
200
- elif 'expensive' in query_lower or 'highest' in query_lower or 'maximum' in query_lower:
201
- return {
202
- 'success': True,
203
- 'query_type': 'expensive',
204
- 'cypher_query': """
205
- MATCH (a:Place)-[r:Fare]->(b:Place)
206
- RETURN a.name as from_place, b.name as to_place, r.fare as fare
207
- ORDER BY r.fare DESC
208
- LIMIT 10
209
- """,
210
- 'message': 'Finding the most expensive routes...'
211
- }
212
- elif 'places' in query_lower or 'locations' in query_lower or 'list all' in query_lower:
213
- return {
214
- 'success': True,
215
- 'query_type': 'places',
216
- 'cypher_query': """
217
- MATCH (p:Place)
218
- RETURN DISTINCT p.name as place
219
- ORDER BY p.name
220
- """,
221
- 'message': 'Listing all places...'
222
- }
223
- elif 'statistics' in query_lower or 'stats' in query_lower:
224
- return {
225
- 'success': True,
226
- 'query_type': 'statistics',
227
- 'cypher_query': """
228
- MATCH (p:Place)
229
- MATCH ()-[r:Fare]->()
230
- RETURN
231
- count(DISTINCT p) as total_places,
232
- count(r) as total_routes,
233
- avg(r.fare) as average_fare,
234
- min(r.fare) as min_fare,
235
- max(r.fare) as max_fare
236
- """,
237
- 'message': 'Getting database statistics...'
238
- }
239
- elif len(locations) >= 2:
240
- # Fare query between two locations
241
- from_location = locations[0][1]
242
- to_location = locations[1][1]
243
- return {
244
- 'success': True,
245
- 'query_type': 'fare',
246
- 'cypher_query': f"""
247
- MATCH (a:Place {{name: '{from_location}'}})-[r:Fare]->(b:Place {{name: '{to_location}'}})
248
- RETURN a.name as from_place, b.name as to_place, r.fare as fare
249
- """,
250
- 'message': f'Finding fare from {from_location} to {to_location}...'
251
- }
252
- elif 'fare' in query_lower and 'to' in query_lower:
253
- # Handle queries like "fare of X to Y" where locations might not be extracted properly
254
- # Try to extract locations using a simpler pattern
255
- import re
256
- fare_patterns = [
257
- r'fare\s+(?:of|from)?\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)\s+to\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)',
258
- r'price\s+(?:of|from)?\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)\s+to\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)',
259
- r'cost\s+(?:of|from)?\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)\s+to\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)',
260
- r'how\s+much\s+(?:is|does)\s+(?:the\s+)?(?:fare|price|cost)\s+(?:from\s+)?([a-zA-Z]+(?:\s+[a-zA-Z]+)*)\s+to\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)',
261
- r'what\s+(?:is|are)\s+(?:the\s+)?(?:fare|price|cost)s?\s+(?:from\s+)?([a-zA-Z]+(?:\s+[a-zA-Z]+)*)\s+to\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)',
262
- r'([a-zA-Z]+(?:\s+[a-zA-Z]+)*)\s+to\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)\s+(?:fare|price|cost)',
263
- r'(?:fare|price|cost)\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)\s+to\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)'
264
- ]
265
-
266
- for pattern in fare_patterns:
267
- match = re.search(pattern, query_lower)
268
- if match:
269
- from_loc = match.group(1).strip()
270
- to_loc = match.group(2).strip()
271
-
272
- # Correct the locations
273
- from_corrected, from_conf, _ = self.spell_corrector.correct_location(from_loc)
274
- to_corrected, to_conf, _ = self.spell_corrector.correct_location(to_loc)
275
-
276
- if from_conf > 0.5 and to_conf > 0.5:
277
- return {
278
- 'success': True,
279
- 'query_type': 'fare',
280
- 'cypher_query': f"""
281
- MATCH (a:Place {{name: '{from_corrected}'}})-[r:Fare]->(b:Place {{name: '{to_corrected}'}})
282
- RETURN a.name as from_place, b.name as to_place, r.fare as fare
283
- """,
284
- 'message': f'Finding fare from {from_corrected} to {to_corrected}...'
285
- }
286
- elif len(locations) == 1:
287
- # Routes from/to a single location
288
- location = locations[0][1]
289
- if 'from' in query_lower:
290
- return {
291
- 'success': True,
292
- 'query_type': 'routes_from',
293
- 'cypher_query': f"""
294
- MATCH (a:Place {{name: '{location}'}})-[r:Fare]->(b:Place)
295
- RETURN a.name as from_place, b.name as to_place, r.fare as fare
296
- ORDER BY r.fare
297
- """,
298
- 'message': f'Finding routes from {location}...'
299
- }
300
- else:
301
- return {
302
- 'success': True,
303
- 'query_type': 'routes_to',
304
- 'cypher_query': f"""
305
- MATCH (a:Place)-[r:Fare]->(b:Place {{name: '{location}'}})
306
- RETURN a.name as from_place, b.name as to_place, r.fare as fare
307
- ORDER BY r.fare
308
- """,
309
- 'message': f'Finding routes to {location}...'
310
- }
311
- else:
312
- return {
313
- 'success': False,
314
- 'message': 'I could not understand your query. Please try rephrasing it.'
315
- }
316
-
317
- def _execute_cypher_query(self, cypher_query: str) -> List[Dict]:
318
- """Execute the generated Cypher query"""
319
- try:
320
- with self.neo4j_service.driver.session() as session:
321
- result = session.run(cypher_query)
322
- return [dict(record) for record in result]
323
- except Exception as e:
324
- print(f"Cypher execution error: {e}")
325
- return []
326
-
327
- def _format_corrections(self, locations: List[Tuple]) -> List[Dict]:
328
- """Format location corrections for display"""
329
- corrections = []
330
- for original, corrected, confidence, method in locations:
331
- if original.lower() != corrected.lower():
332
- corrections.append({
333
- 'original': original,
334
- 'corrected': corrected,
335
- 'confidence': confidence,
336
- 'method': method
337
- })
338
- return corrections
339
-
340
- def _get_query_suggestions(self) -> List[str]:
341
- """Get query suggestions"""
342
- return [
343
- "What is the fare from Colombo to Kandy?",
344
- "What is the lowest fare price?",
345
- "Show me the cheapest routes",
346
- "Show me the most expensive routes",
347
- "List all places",
348
- "Routes from Panadura",
349
- "Routes to Galle",
350
- "Database statistics"
351
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ LLM-Based Query Processor for Transport Query Application
4
+ Uses Google Gemini AI to interpret queries and generate Cypher queries
5
+ """
6
+
7
+ import re
8
+ import json
9
+ from typing import Dict, List, Tuple, Optional
10
+ from spell_corrector import SpellCorrector
11
+ from neo4j_service import Neo4jService
12
+ from config import Config
13
+ import google.generativeai as genai
14
+ import os
15
+ from dotenv import load_dotenv
16
+ load_dotenv()
17
+
18
+ class LLMQueryProcessor:
19
+ """Process natural language queries using LLM for interpretation and Cypher generation"""
20
+
21
+ def __init__(self):
22
+ self.config = Config()
23
+ self.spell_corrector = SpellCorrector()
24
+ self.neo4j_service = Neo4jService()
25
+
26
+ # Configure Google Generative AI
27
+ if hasattr(self.config, 'GOOGLE_API_KEY') and self.config.GOOGLE_API_KEY:
28
+ genai.configure(api_key=self.config.GOOGLE_API_KEY)
29
+ self.google_api_available = True
30
+ else:
31
+ # Fallback to hardcoded API key if not in config
32
+ google_api_key = os.getenv("GOOGLE_API_KEY")
33
+ genai.configure(api_key=google_api_key)
34
+ self.google_api_available = True
35
+
36
+ def process_query(self, user_query: str) -> Dict:
37
+ """
38
+ Process a natural language query using LLM for interpretation
39
+
40
+ Returns:
41
+ Dictionary with query results and metadata
42
+ """
43
+ try:
44
+ # First, extract and correct locations from the query
45
+ locations = self.spell_corrector.extract_locations_from_query(user_query)
46
+
47
+ # Use LLM to interpret the query and generate Cypher
48
+ interpretation = self._interpret_query_with_llm(user_query, locations)
49
+
50
+ if interpretation['success']:
51
+ # Execute the generated Cypher query
52
+ result = self._execute_cypher_query(interpretation['cypher_query'])
53
+
54
+ return {
55
+ 'success': True,
56
+ 'message': interpretation['message'],
57
+ 'cypher_query': interpretation['cypher_query'],
58
+ 'data': result,
59
+ 'corrections': self._format_corrections(locations),
60
+ 'query_type': interpretation['query_type']
61
+ }
62
+ else:
63
+ return {
64
+ 'success': False,
65
+ 'message': interpretation['message'],
66
+ 'suggestions': self._get_query_suggestions()
67
+ }
68
+
69
+ except Exception as e:
70
+ print(f"Query processing error: {e}")
71
+ return {
72
+ 'success': False,
73
+ 'message': 'An error occurred while processing your query.',
74
+ 'suggestions': self._get_query_suggestions()
75
+ }
76
+
77
+ def _interpret_query_with_llm(self, query: str, locations: List[Tuple]) -> Dict:
78
+ """Use Google Gemini AI to interpret the query and generate appropriate Cypher"""
79
+ try:
80
+ if not self.google_api_available:
81
+ return self._fallback_interpretation(query, locations)
82
+
83
+ # Get available places for context
84
+ available_places = list(self.neo4j_service.get_all_places())
85
+
86
+ # Create comprehensive prompt for query interpretation
87
+ prompt = f"""
88
+ You are an intelligent transport query interpreter for a Neo4j database containing Sri Lankan transport data.
89
+
90
+ Database Schema:
91
+ - Nodes: Place (with property 'name')
92
+ - Relationships: Fare (with property 'fare')
93
+
94
+ Available Places: {', '.join(available_places[:50])}... (total: {len(available_places)})
95
+
96
+ User Query: "{query}"
97
+
98
+ Extracted Locations: {[f"{orig}->{corr}" for orig, corr, conf, method in locations]}
99
+
100
+ Your task is to:
101
+ 1. Determine the query type (fare, cheapest, expensive, places, routes_from, routes_to, statistics, lowest_fare)
102
+ 2. Generate the appropriate Cypher query
103
+ 3. Provide a clear response message
104
+
105
+ Query Types:
106
+ - fare: Find fare between two specific locations
107
+ - cheapest: Find cheapest routes (top 10)
108
+ - expensive: Find most expensive routes (top 10)
109
+ - places: List all places
110
+ - routes_from: Find routes departing from a location
111
+ - routes_to: Find routes arriving at a location
112
+ - statistics: Get database statistics
113
+ - lowest_fare: Find the single lowest fare with route details
114
+
115
+ Return your response in this exact JSON format:
116
+ {{
117
+ "query_type": "fare|cheapest|expensive|places|routes_from|routes_to|statistics|lowest_fare",
118
+ "cypher_query": "MATCH ... RETURN ...",
119
+ "message": "Clear response message for the user"
120
+ }}
121
+
122
+ Examples:
123
+ - "What is the fare from Colombo to Kandy?" → fare query: MATCH (a:Place {name: 'Colombo'})-[r:Fare]->(b:Place {name: 'Kandy'}) RETURN a.name as from_place, b.name as to_place, r.fare as fare
124
+ - "fare of anuradhapura to kandy?" → fare query: MATCH (a:Place {name: 'Anuradapura'})-[r:Fare]->(b:Place {name: 'Kandy'}) RETURN a.name as from_place, b.name as to_place, r.fare as fare
125
+ - "Show me the cheapest routes" → cheapest query: MATCH (a:Place)-[r:Fare]->(b:Place) RETURN a.name as from_place, b.name as to_place, r.fare as fare ORDER BY r.fare ASC LIMIT 10
126
+ - "What is the lowest fare?" → lowest_fare query: MATCH (a:Place)-[r:Fare]->(b:Place) RETURN a.name as from_place, b.name as to_place, r.fare as fare ORDER BY r.fare ASC LIMIT 1
127
+ - "List all places" → places query: MATCH (p:Place) RETURN DISTINCT p.name as place ORDER BY p.name
128
+ - "Routes from Colombo" → routes_from query: MATCH (a:Place {name: 'Colombo'})-[r:Fare]->(b:Place) RETURN a.name as from_place, b.name as to_place, r.fare as fare ORDER BY r.fare
129
+ - "Database statistics" → statistics query: MATCH (p:Place) MATCH ()-[r:Fare]->() RETURN count(DISTINCT p) as total_places, count(r) as total_routes, avg(r.fare) as average_fare, min(r.fare) as min_fare, max(r.fare) as max_fare
130
+
131
+ Keep Cypher queries simple and avoid complex functions like shortestPath. Use direct relationships only.
132
+
133
+ For fare queries, recognize various formats like "fare of X to Y", "fare from X to Y", "price from X to Y", etc.
134
+ """
135
+
136
+ # Call Google Gemini AI
137
+ interpretation = None
138
+ try:
139
+ # Initialize the Gemini model
140
+ model = genai.GenerativeModel('gemini-1.5-flash')
141
+
142
+ # Create the full prompt with system instructions
143
+ full_prompt = f"""You are a transport query interpreter for a Neo4j database. Return only valid JSON.
144
+
145
+ {prompt}"""
146
+
147
+ # Generate content using Gemini
148
+ response = model.generate_content(
149
+ full_prompt,
150
+ generation_config=genai.types.GenerationConfig(
151
+ max_output_tokens=500,
152
+ temperature=0.1,
153
+ response_mime_type="application/json"
154
+ )
155
+ )
156
+
157
+ # Parse the JSON response
158
+ interpretation = json.loads(response.text.strip())
159
+
160
+ except json.JSONDecodeError as json_err:
161
+ print(f"JSON parsing error: {json_err}")
162
+ # Try to extract JSON from response if it's wrapped in text
163
+ try:
164
+ response_text = response.text.strip()
165
+ # Look for JSON-like content in the response
166
+ json_start = response_text.find('{')
167
+ json_end = response_text.rfind('}') + 1
168
+ if json_start != -1 and json_end > json_start:
169
+ json_content = response_text[json_start:json_end]
170
+ interpretation = json.loads(json_content)
171
+ else:
172
+ raise json_err
173
+ except Exception:
174
+ print(f"Could not parse response: {response.text}")
175
+ raise json_err
176
+
177
+ # Validate the response
178
+ if interpretation and 'query_type' in interpretation and 'cypher_query' in interpretation and 'message' in interpretation:
179
+ return {
180
+ 'success': True,
181
+ 'query_type': interpretation['query_type'],
182
+ 'cypher_query': interpretation['cypher_query'],
183
+ 'message': interpretation['message']
184
+ }
185
+ else:
186
+ return self._fallback_interpretation(query, locations)
187
+
188
+ except Exception as e:
189
+ error_message = str(e)
190
+ print(f"Google Gemini AI interpretation error: {e}")
191
+
192
+ # Handle specific Google API errors
193
+ if "quota" in error_message.lower() or "limit" in error_message.lower():
194
+ print("⚠️ Google API quota exceeded, falling back to rule-based interpretation")
195
+ elif "API_KEY_INVALID" in error_message or "authentication" in error_message.lower():
196
+ print("⚠️ Google API authentication failed, falling back to rule-based interpretation")
197
+ elif "models/gemini" in error_message.lower():
198
+ print("⚠️ Gemini model not available, falling back to rule-based interpretation")
199
+
200
+ return self._fallback_interpretation(query, locations)
201
+
202
+ def _fallback_interpretation(self, query: str, locations: List[Tuple]) -> Dict:
203
+ """Fallback interpretation when LLM is not available"""
204
+ query_lower = query.lower()
205
+
206
+ # Simple keyword-based interpretation
207
+ if 'lowest' in query_lower or 'minimum' in query_lower or 'cheapest' in query_lower:
208
+ if 'lowest fare' in query_lower or 'minimum fare' in query_lower:
209
+ return {
210
+ 'success': True,
211
+ 'query_type': 'lowest_fare',
212
+ 'cypher_query': """
213
+ MATCH (a:Place)-[r:Fare]->(b:Place)
214
+ WITH a, b, r, r.fare as fare
215
+ ORDER BY r.fare ASC
216
+ LIMIT 1
217
+ RETURN a.name as from_place, b.name as to_place, fare
218
+ """,
219
+ 'message': 'Finding the lowest fare in the database...'
220
+ }
221
+ else:
222
+ return {
223
+ 'success': True,
224
+ 'query_type': 'cheapest',
225
+ 'cypher_query': """
226
+ MATCH (a:Place)-[r:Fare]->(b:Place)
227
+ RETURN a.name as from_place, b.name as to_place, r.fare as fare
228
+ ORDER BY r.fare ASC
229
+ LIMIT 10
230
+ """,
231
+ 'message': 'Finding the cheapest routes...'
232
+ }
233
+ elif 'expensive' in query_lower or 'highest' in query_lower or 'maximum' in query_lower:
234
+ return {
235
+ 'success': True,
236
+ 'query_type': 'expensive',
237
+ 'cypher_query': """
238
+ MATCH (a:Place)-[r:Fare]->(b:Place)
239
+ RETURN a.name as from_place, b.name as to_place, r.fare as fare
240
+ ORDER BY r.fare DESC
241
+ LIMIT 10
242
+ """,
243
+ 'message': 'Finding the most expensive routes...'
244
+ }
245
+ elif 'places' in query_lower or 'locations' in query_lower or 'list all' in query_lower:
246
+ return {
247
+ 'success': True,
248
+ 'query_type': 'places',
249
+ 'cypher_query': """
250
+ MATCH (p:Place)
251
+ RETURN DISTINCT p.name as place
252
+ ORDER BY p.name
253
+ """,
254
+ 'message': 'Listing all places...'
255
+ }
256
+ elif 'statistics' in query_lower or 'stats' in query_lower:
257
+ return {
258
+ 'success': True,
259
+ 'query_type': 'statistics',
260
+ 'cypher_query': """
261
+ MATCH (p:Place)
262
+ MATCH ()-[r:Fare]->()
263
+ RETURN
264
+ count(DISTINCT p) as total_places,
265
+ count(r) as total_routes,
266
+ avg(r.fare) as average_fare,
267
+ min(r.fare) as min_fare,
268
+ max(r.fare) as max_fare
269
+ """,
270
+ 'message': 'Getting database statistics...'
271
+ }
272
+ elif len(locations) >= 2:
273
+ # Fare query between two locations
274
+ from_location = locations[0][1]
275
+ to_location = locations[1][1]
276
+ return {
277
+ 'success': True,
278
+ 'query_type': 'fare',
279
+ 'cypher_query': f"""
280
+ MATCH (a:Place {{name: '{from_location}'}})-[r:Fare]->(b:Place {{name: '{to_location}'}})
281
+ RETURN a.name as from_place, b.name as to_place, r.fare as fare
282
+ """,
283
+ 'message': f'Finding fare from {from_location} to {to_location}...'
284
+ }
285
+ elif 'fare' in query_lower and 'to' in query_lower:
286
+ # Handle queries like "fare of X to Y" where locations might not be extracted properly
287
+ # Try to extract locations using a simpler pattern
288
+ import re
289
+ fare_patterns = [
290
+ r'fare\s+(?:of|from)?\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)\s+to\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)',
291
+ r'price\s+(?:of|from)?\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)\s+to\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)',
292
+ r'cost\s+(?:of|from)?\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)\s+to\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)',
293
+ r'how\s+much\s+(?:is|does)\s+(?:the\s+)?(?:fare|price|cost)\s+(?:from\s+)?([a-zA-Z]+(?:\s+[a-zA-Z]+)*)\s+to\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)',
294
+ r'what\s+(?:is|are)\s+(?:the\s+)?(?:fare|price|cost)s?\s+(?:from\s+)?([a-zA-Z]+(?:\s+[a-zA-Z]+)*)\s+to\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)',
295
+ r'([a-zA-Z]+(?:\s+[a-zA-Z]+)*)\s+to\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)\s+(?:fare|price|cost)',
296
+ r'(?:fare|price|cost)\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)\s+to\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)'
297
+ ]
298
+
299
+ for pattern in fare_patterns:
300
+ match = re.search(pattern, query_lower)
301
+ if match:
302
+ from_loc = match.group(1).strip()
303
+ to_loc = match.group(2).strip()
304
+
305
+ # Correct the locations
306
+ from_corrected, from_conf, _ = self.spell_corrector.correct_location(from_loc)
307
+ to_corrected, to_conf, _ = self.spell_corrector.correct_location(to_loc)
308
+
309
+ if from_conf > 0.5 and to_conf > 0.5:
310
+ return {
311
+ 'success': True,
312
+ 'query_type': 'fare',
313
+ 'cypher_query': f"""
314
+ MATCH (a:Place {{name: '{from_corrected}'}})-[r:Fare]->(b:Place {{name: '{to_corrected}'}})
315
+ RETURN a.name as from_place, b.name as to_place, r.fare as fare
316
+ """,
317
+ 'message': f'Finding fare from {from_corrected} to {to_corrected}...'
318
+ }
319
+ elif len(locations) == 1:
320
+ # Routes from/to a single location
321
+ location = locations[0][1]
322
+ if 'from' in query_lower:
323
+ return {
324
+ 'success': True,
325
+ 'query_type': 'routes_from',
326
+ 'cypher_query': f"""
327
+ MATCH (a:Place {{name: '{location}'}})-[r:Fare]->(b:Place)
328
+ RETURN a.name as from_place, b.name as to_place, r.fare as fare
329
+ ORDER BY r.fare
330
+ """,
331
+ 'message': f'Finding routes from {location}...'
332
+ }
333
+ else:
334
+ return {
335
+ 'success': True,
336
+ 'query_type': 'routes_to',
337
+ 'cypher_query': f"""
338
+ MATCH (a:Place)-[r:Fare]->(b:Place {{name: '{location}'}})
339
+ RETURN a.name as from_place, b.name as to_place, r.fare as fare
340
+ ORDER BY r.fare
341
+ """,
342
+ 'message': f'Finding routes to {location}...'
343
+ }
344
+ else:
345
+ return {
346
+ 'success': False,
347
+ 'message': 'I could not understand your query. Please try rephrasing it.'
348
+ }
349
+
350
+ def _execute_cypher_query(self, cypher_query: str) -> List[Dict]:
351
+ """Execute the generated Cypher query"""
352
+ try:
353
+ with self.neo4j_service.driver.session() as session:
354
+ result = session.run(cypher_query)
355
+ return [dict(record) for record in result]
356
+ except Exception as e:
357
+ print(f"Cypher execution error: {e}")
358
+ return []
359
+
360
+ def _format_corrections(self, locations: List[Tuple]) -> List[Dict]:
361
+ """Format location corrections for display"""
362
+ corrections = []
363
+ for original, corrected, confidence, method in locations:
364
+ if original.lower() != corrected.lower():
365
+ corrections.append({
366
+ 'original': original,
367
+ 'corrected': corrected,
368
+ 'confidence': confidence,
369
+ 'method': method
370
+ })
371
+ return corrections
372
+
373
+ def _get_query_suggestions(self) -> List[str]:
374
+ """Get query suggestions"""
375
+ return [
376
+ "What is the fare from Colombo to Kandy?",
377
+ "What is the lowest fare price?",
378
+ "Show me the cheapest routes",
379
+ "Show me the most expensive routes",
380
+ "List all places",
381
+ "Routes from Panadura",
382
+ "Routes to Galle",
383
+ "Database statistics"
384
+ ]
logger.py CHANGED
@@ -1,61 +1,61 @@
1
- #!/usr/bin/env python3
2
- """
3
- Centralized logging setup for the Transport Query Application.
4
- Provides a rotating file handler and console output.
5
- """
6
-
7
- import logging
8
- import os
9
- from logging.handlers import RotatingFileHandler
10
-
11
-
12
- def get_logger(name: str) -> logging.Logger:
13
- """Create or retrieve a configured logger with file and console handlers."""
14
- logger = logging.getLogger(name)
15
-
16
- if getattr(logger, "_configured", False):
17
- return logger
18
-
19
- log_level_str = os.getenv("LOG_LEVEL", "INFO").upper()
20
- log_dir = os.getenv("LOG_DIR", os.path.join(os.path.dirname(__file__), "..", "logs"))
21
-
22
- # Try to create log directory, fallback to current directory if it fails
23
- try:
24
- log_dir = os.path.abspath(log_dir)
25
- os.makedirs(log_dir, exist_ok=True)
26
- except Exception:
27
- # Fallback to current directory if path invalid
28
- log_dir = os.getcwd()
29
-
30
- log_path = os.path.join(log_dir, "app.log")
31
-
32
- formatter = logging.Formatter(
33
- fmt="%(asctime)s %(levelname)s [%(name)s] %(message)s",
34
- datefmt="%Y-%m-%d %H:%M:%S",
35
- )
36
-
37
- # Console handler (always available)
38
- console_handler = logging.StreamHandler()
39
- console_handler.setFormatter(formatter)
40
- logger.addHandler(console_handler)
41
-
42
- # Try to add file handler, but don't fail if it doesn't work
43
- try:
44
- file_handler = RotatingFileHandler(log_path, maxBytes=1_000_000, backupCount=5, encoding="utf-8")
45
- file_handler.setFormatter(formatter)
46
- logger.addHandler(file_handler)
47
- logger.debug(f"File logging enabled: {log_path}")
48
- except (PermissionError, OSError) as e:
49
- # If file logging fails, just log to console
50
- logger.warning(f"File logging disabled due to permission error: {e}")
51
- logger.warning("Logging to console only")
52
-
53
- # Configure logger
54
- logger.setLevel(getattr(logging, log_level_str, logging.INFO))
55
- logger.propagate = False
56
-
57
- logger._configured = True # type: ignore[attr-defined]
58
- logger.debug(f"Logger initialized. Level={log_level_str}")
59
- return logger
60
-
61
-
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Centralized logging setup for the Transport Query Application.
4
+ Provides a rotating file handler and console output.
5
+ """
6
+
7
+ import logging
8
+ import os
9
+ from logging.handlers import RotatingFileHandler
10
+
11
+
12
+ def get_logger(name: str) -> logging.Logger:
13
+ """Create or retrieve a configured logger with file and console handlers."""
14
+ logger = logging.getLogger(name)
15
+
16
+ if getattr(logger, "_configured", False):
17
+ return logger
18
+
19
+ log_level_str = os.getenv("LOG_LEVEL", "INFO").upper()
20
+ log_dir = os.getenv("LOG_DIR", os.path.join(os.path.dirname(__file__), "..", "logs"))
21
+
22
+ # Try to create log directory, fallback to current directory if it fails
23
+ try:
24
+ log_dir = os.path.abspath(log_dir)
25
+ os.makedirs(log_dir, exist_ok=True)
26
+ except Exception:
27
+ # Fallback to current directory if path invalid
28
+ log_dir = os.getcwd()
29
+
30
+ log_path = os.path.join(log_dir, "app.log")
31
+
32
+ formatter = logging.Formatter(
33
+ fmt="%(asctime)s %(levelname)s [%(name)s] %(message)s",
34
+ datefmt="%Y-%m-%d %H:%M:%S",
35
+ )
36
+
37
+ # Console handler (always available)
38
+ console_handler = logging.StreamHandler()
39
+ console_handler.setFormatter(formatter)
40
+ logger.addHandler(console_handler)
41
+
42
+ # Try to add file handler, but don't fail if it doesn't work
43
+ try:
44
+ file_handler = RotatingFileHandler(log_path, maxBytes=1_000_000, backupCount=5, encoding="utf-8")
45
+ file_handler.setFormatter(formatter)
46
+ logger.addHandler(file_handler)
47
+ logger.debug(f"File logging enabled: {log_path}")
48
+ except (PermissionError, OSError) as e:
49
+ # If file logging fails, just log to console
50
+ logger.warning(f"File logging disabled due to permission error: {e}")
51
+ logger.warning("Logging to console only")
52
+
53
+ # Configure logger
54
+ logger.setLevel(getattr(logging, log_level_str, logging.INFO))
55
+ logger.propagate = False
56
+
57
+ logger._configured = True # type: ignore[attr-defined]
58
+ logger.debug(f"Logger initialized. Level={log_level_str}")
59
+ return logger
60
+
61
+
neo4j_service.py CHANGED
@@ -1,222 +1,222 @@
1
- #!/usr/bin/env python3
2
- """
3
- Neo4j Service for Transport Query Application
4
- Handles all database operations
5
- """
6
-
7
- from neo4j import GraphDatabase
8
- from typing import List, Dict, Optional, Tuple
9
- from config import Config
10
-
11
- class Neo4jService:
12
- """Neo4j database service"""
13
-
14
- def __init__(self):
15
- self.config = Config()
16
- self.driver = None
17
- self._connect()
18
-
19
- def _connect(self):
20
- """Connect to Neo4j database"""
21
- try:
22
- self.driver = GraphDatabase.driver(
23
- self.config.NEO4J_URI,
24
- auth=(self.config.NEO4J_USER, self.config.NEO4J_PASSWORD)
25
- )
26
- # Test connection
27
- with self.driver.session() as session:
28
- session.run("RETURN 1")
29
- print("✅ Connected to Neo4j database")
30
- except Exception as e:
31
- print(f"❌ Failed to connect to Neo4j: {e}")
32
- self.driver = None
33
-
34
- def is_connected(self) -> bool:
35
- """Check if connected to Neo4j"""
36
- return self.driver is not None
37
-
38
- def get_fare(self, from_location: str, to_location: str) -> Optional[Dict]:
39
- """Get fare between two locations"""
40
- if not self.is_connected():
41
- return None
42
-
43
- try:
44
- with self.driver.session() as session:
45
- result = session.run("""
46
- MATCH (a:Place {name: $from_location})-[r:Fare]->(b:Place {name: $to_location})
47
- RETURN a.name as from_place, b.name as to_place, r.fare as fare
48
- """, from_location=from_location, to_location=to_location)
49
-
50
- record = result.single()
51
- if record:
52
- return {
53
- 'from_place': record['from_place'],
54
- 'to_place': record['to_place'],
55
- 'fare': record['fare']
56
- }
57
- return None
58
-
59
- except Exception as e:
60
- print(f"Error getting fare: {e}")
61
- return None
62
-
63
- def get_all_places(self) -> List[str]:
64
- """Get all available places"""
65
- if not self.is_connected():
66
- return []
67
-
68
- try:
69
- with self.driver.session() as session:
70
- result = session.run("""
71
- MATCH (p:Place)
72
- RETURN DISTINCT p.name as place
73
- ORDER BY p.name
74
- """)
75
-
76
- return [record['place'] for record in result]
77
-
78
- except Exception as e:
79
- print(f"Error getting places: {e}")
80
- return []
81
-
82
- def get_routes_from_location(self, from_location: str) -> List[Dict]:
83
- """Get all routes from a specific location"""
84
- if not self.is_connected():
85
- return []
86
-
87
- try:
88
- with self.driver.session() as session:
89
- result = session.run("""
90
- MATCH (a:Place {name: $from_location})-[r:Fare]->(b:Place)
91
- RETURN a.name as from_place, b.name as to_place, r.fare as fare
92
- ORDER BY r.fare
93
- """, from_location=from_location)
94
-
95
- return [dict(record) for record in result]
96
-
97
- except Exception as e:
98
- print(f"Error getting routes from location: {e}")
99
- return []
100
-
101
- def get_routes_to_location(self, to_location: str) -> List[Dict]:
102
- """Get all routes to a specific location"""
103
- if not self.is_connected():
104
- return []
105
-
106
- try:
107
- with self.driver.session() as session:
108
- result = session.run("""
109
- MATCH (a:Place)-[r:Fare]->(b:Place {name: $to_location})
110
- RETURN a.name as from_place, b.name as to_place, r.fare as fare
111
- ORDER BY r.fare
112
- """, to_location=to_location)
113
-
114
- return [dict(record) for record in result]
115
-
116
- except Exception as e:
117
- print(f"Error getting routes to location: {e}")
118
- return []
119
-
120
- def get_cheapest_routes(self, limit: int = 10) -> List[Dict]:
121
- """Get cheapest routes"""
122
- if not self.is_connected():
123
- return []
124
-
125
- try:
126
- with self.driver.session() as session:
127
- result = session.run("""
128
- MATCH (a:Place)-[r:Fare]->(b:Place)
129
- RETURN a.name as from_place, b.name as to_place, r.fare as fare
130
- ORDER BY r.fare ASC
131
- LIMIT $limit
132
- """, limit=limit)
133
-
134
- return [dict(record) for record in result]
135
-
136
- except Exception as e:
137
- print(f"Error getting cheapest routes: {e}")
138
- return []
139
-
140
- def get_most_expensive_routes(self, limit: int = 10) -> List[Dict]:
141
- """Get most expensive routes"""
142
- if not self.is_connected():
143
- return []
144
-
145
- try:
146
- with self.driver.session() as session:
147
- result = session.run("""
148
- MATCH (a:Place)-[r:Fare]->(b:Place)
149
- RETURN a.name as from_place, b.name as to_place, r.fare as fare
150
- ORDER BY r.fare DESC
151
- LIMIT $limit
152
- """, limit=limit)
153
-
154
- return [dict(record) for record in result]
155
-
156
- except Exception as e:
157
- print(f"Error getting most expensive routes: {e}")
158
- return []
159
-
160
- def search_routes_by_fare_range(self, min_fare: float, max_fare: float) -> List[Dict]:
161
- """Search routes within a fare range"""
162
- if not self.is_connected():
163
- return []
164
-
165
- try:
166
- with self.driver.session() as session:
167
- result = session.run("""
168
- MATCH (a:Place)-[r:Fare]->(b:Place)
169
- WHERE r.fare >= $min_fare AND r.fare <= $max_fare
170
- RETURN a.name as from_place, b.name as to_place, r.fare as fare
171
- ORDER BY r.fare
172
- """, min_fare=min_fare, max_fare=max_fare)
173
-
174
- return [dict(record) for record in result]
175
-
176
- except Exception as e:
177
- print(f"Error searching routes by fare range: {e}")
178
- return []
179
-
180
- def get_route_statistics(self) -> Dict:
181
- """Get database statistics"""
182
- if not self.is_connected():
183
- return {}
184
-
185
- try:
186
- with self.driver.session() as session:
187
- # Count places
188
- places_result = session.run("MATCH (p:Place) RETURN count(p) as place_count")
189
- place_count = places_result.single()['place_count']
190
-
191
- # Count routes
192
- routes_result = session.run("MATCH ()-[r:Fare]->() RETURN count(r) as route_count")
193
- route_count = routes_result.single()['route_count']
194
-
195
- # Average fare
196
- avg_result = session.run("MATCH ()-[r:Fare]->() RETURN avg(r.fare) as avg_fare")
197
- avg_fare = avg_result.single()['avg_fare']
198
-
199
- # Min and max fares
200
- fare_range_result = session.run("""
201
- MATCH ()-[r:Fare]->()
202
- RETURN min(r.fare) as min_fare, max(r.fare) as max_fare
203
- """)
204
- fare_range = fare_range_result.single()
205
-
206
- return {
207
- 'total_places': place_count,
208
- 'total_routes': route_count,
209
- 'average_fare': round(avg_fare, 2) if avg_fare else 0,
210
- 'min_fare': fare_range['min_fare'],
211
- 'max_fare': fare_range['max_fare']
212
- }
213
-
214
- except Exception as e:
215
- print(f"Error getting statistics: {e}")
216
- return {}
217
-
218
- def close(self):
219
- """Close database connection"""
220
- if self.driver:
221
- self.driver.close()
222
- self.driver = None
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Neo4j Service for Transport Query Application
4
+ Handles all database operations
5
+ """
6
+
7
+ from neo4j import GraphDatabase
8
+ from typing import List, Dict, Optional, Tuple
9
+ from config import Config
10
+
11
class Neo4jService:
    """Neo4j database service.

    Wraps a Neo4j driver and exposes read-only fare/route queries over
    ``(:Place)-[:Fare]->(:Place)`` data. Every query method is best-effort:
    when the service is not connected or a query fails, the error is printed
    and an empty value (``None``, ``[]`` or ``{}``) is returned instead of
    raising.

    The service can also be used as a context manager, which closes the
    driver on exit.
    """

    def __init__(self):
        self.config = Config()
        self.driver = None
        self._connect()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def _connect(self):
        """Connect to Neo4j database.

        The driver is only stored on ``self.driver`` after a test query
        succeeds. On any failure the partially created driver is closed so
        its resources are not leaked, and ``self.driver`` is left as ``None``.
        """
        driver = None
        try:
            driver = GraphDatabase.driver(
                self.config.NEO4J_URI,
                auth=(self.config.NEO4J_USER, self.config.NEO4J_PASSWORD)
            )
            # Test connection
            with driver.session() as session:
                session.run("RETURN 1")
            self.driver = driver
            print("✅ Connected to Neo4j database")
        except Exception as e:
            print(f"❌ Failed to connect to Neo4j: {e}")
            if driver is not None:
                try:
                    driver.close()
                except Exception as close_err:
                    # Cleanup is best-effort; report but do not mask the
                    # original connection failure.
                    print(f"Error closing Neo4j driver: {close_err}")
            self.driver = None

    def is_connected(self) -> bool:
        """Check if connected to Neo4j"""
        return self.driver is not None

    def _fetch_rows(self, query: str, action: str, **params) -> List[Dict]:
        """Run ``query`` and return each record as a plain dict.

        Args:
            query: Cypher text to execute.
            action: Short description used in the printed error message
                (``"Error <action>: ..."``).
            **params: Query parameters forwarded to ``session.run``.

        Returns:
            A list of dicts, or ``[]`` when disconnected or on error.
        """
        if not self.is_connected():
            return []

        try:
            with self.driver.session() as session:
                return [dict(record) for record in session.run(query, **params)]
        except Exception as e:
            print(f"Error {action}: {e}")
            return []

    def get_fare(self, from_location: str, to_location: str) -> Optional[Dict]:
        """Get fare between two locations.

        Returns:
            A dict with ``from_place``, ``to_place`` and ``fare`` keys, or
            ``None`` when no such route is found, the service is not
            connected, or the query fails.
        """
        if not self.is_connected():
            return None

        try:
            with self.driver.session() as session:
                result = session.run("""
                    MATCH (a:Place {name: $from_location})-[r:Fare]->(b:Place {name: $to_location})
                    RETURN a.name as from_place, b.name as to_place, r.fare as fare
                """, from_location=from_location, to_location=to_location)

                record = result.single()
                if record:
                    return {
                        'from_place': record['from_place'],
                        'to_place': record['to_place'],
                        'fare': record['fare']
                    }
                return None

        except Exception as e:
            print(f"Error getting fare: {e}")
            return None

    def get_all_places(self) -> List[str]:
        """Get all available places, ordered by name."""
        if not self.is_connected():
            return []

        try:
            with self.driver.session() as session:
                result = session.run("""
                    MATCH (p:Place)
                    RETURN DISTINCT p.name as place
                    ORDER BY p.name
                """)

                return [record['place'] for record in result]

        except Exception as e:
            print(f"Error getting places: {e}")
            return []

    def get_routes_from_location(self, from_location: str) -> List[Dict]:
        """Get all routes from a specific location, cheapest first."""
        return self._fetch_rows("""
            MATCH (a:Place {name: $from_location})-[r:Fare]->(b:Place)
            RETURN a.name as from_place, b.name as to_place, r.fare as fare
            ORDER BY r.fare
        """, "getting routes from location", from_location=from_location)

    def get_routes_to_location(self, to_location: str) -> List[Dict]:
        """Get all routes to a specific location, cheapest first."""
        return self._fetch_rows("""
            MATCH (a:Place)-[r:Fare]->(b:Place {name: $to_location})
            RETURN a.name as from_place, b.name as to_place, r.fare as fare
            ORDER BY r.fare
        """, "getting routes to location", to_location=to_location)

    def get_cheapest_routes(self, limit: int = 10) -> List[Dict]:
        """Get the ``limit`` cheapest routes."""
        return self._fetch_rows("""
            MATCH (a:Place)-[r:Fare]->(b:Place)
            RETURN a.name as from_place, b.name as to_place, r.fare as fare
            ORDER BY r.fare ASC
            LIMIT $limit
        """, "getting cheapest routes", limit=limit)

    def get_most_expensive_routes(self, limit: int = 10) -> List[Dict]:
        """Get the ``limit`` most expensive routes."""
        return self._fetch_rows("""
            MATCH (a:Place)-[r:Fare]->(b:Place)
            RETURN a.name as from_place, b.name as to_place, r.fare as fare
            ORDER BY r.fare DESC
            LIMIT $limit
        """, "getting most expensive routes", limit=limit)

    def search_routes_by_fare_range(self, min_fare: float, max_fare: float) -> List[Dict]:
        """Search routes whose fare lies in ``[min_fare, max_fare]`` (inclusive)."""
        return self._fetch_rows("""
            MATCH (a:Place)-[r:Fare]->(b:Place)
            WHERE r.fare >= $min_fare AND r.fare <= $max_fare
            RETURN a.name as from_place, b.name as to_place, r.fare as fare
            ORDER BY r.fare
        """, "searching routes by fare range", min_fare=min_fare, max_fare=max_fare)

    def get_route_statistics(self) -> Dict:
        """Get database statistics.

        Returns:
            A dict with ``total_places``, ``total_routes``, ``average_fare``
            (rounded to 2 decimals, ``0`` when there is no average),
            ``min_fare`` and ``max_fare``; ``{}`` when disconnected or on
            error.
        """
        if not self.is_connected():
            return {}

        try:
            with self.driver.session() as session:
                # Count places
                places_result = session.run("MATCH (p:Place) RETURN count(p) as place_count")
                place_count = places_result.single()['place_count']

                # Count routes
                routes_result = session.run("MATCH ()-[r:Fare]->() RETURN count(r) as route_count")
                route_count = routes_result.single()['route_count']

                # Average fare
                avg_result = session.run("MATCH ()-[r:Fare]->() RETURN avg(r.fare) as avg_fare")
                avg_fare = avg_result.single()['avg_fare']

                # Min and max fares
                fare_range_result = session.run("""
                    MATCH ()-[r:Fare]->()
                    RETURN min(r.fare) as min_fare, max(r.fare) as max_fare
                """)
                fare_range = fare_range_result.single()

                return {
                    'total_places': place_count,
                    'total_routes': route_count,
                    'average_fare': round(avg_fare, 2) if avg_fare else 0,
                    'min_fare': fare_range['min_fare'],
                    'max_fare': fare_range['max_fare']
                }

        except Exception as e:
            print(f"Error getting statistics: {e}")
            return {}

    def close(self):
        """Close database connection (safe to call more than once)."""
        if self.driver:
            self.driver.close()
            self.driver = None
spell_corrector.py CHANGED
@@ -1,257 +1,257 @@
1
- #!/usr/bin/env python3
2
- """
3
- Spell Correction Module for Transport Query Application
4
- Handles location name corrections using fuzzy matching and LLM
5
- """
6
-
7
- import re
8
- from fuzzywuzzy import fuzz
9
- from typing import List, Tuple, Optional
10
- import openai
11
- from config import Config
12
-
13
- class SpellCorrector:
14
- """Spell correction for location names"""
15
-
16
- def __init__(self):
17
- self.config = Config()
18
- self.location_mapping = self.config.LOCATION_MAPPING
19
- self.available_locations = set(self.location_mapping.values())
20
-
21
- # Initialize OpenAI if API key is available
22
- if self.config.OPENAI_API_KEY:
23
- try:
24
- # Prefer new SDK client if installed; otherwise set legacy api key
25
- try:
26
- from openai import OpenAI # noqa: F401
27
- self.llm_available = True
28
- except Exception:
29
- openai.api_key = self.config.OPENAI_API_KEY
30
- self.llm_available = True
31
- except Exception:
32
- self.llm_available = False
33
- else:
34
- self.llm_available = False
35
-
36
- def correct_location(self, location: str) -> Tuple[str, float, str]:
37
- """
38
- Correct a location name using multiple methods
39
-
40
- Returns:
41
- Tuple of (corrected_name, confidence_score, correction_method)
42
- """
43
- location = location.strip().lower()
44
-
45
- # Method 1: Direct mapping
46
- if location in self.location_mapping:
47
- corrected = self.location_mapping[location]
48
- return corrected, 1.0, "direct_mapping"
49
-
50
- # Method 2: Fuzzy matching
51
- best_match, confidence = self._fuzzy_match(location)
52
- if confidence >= self.config.SIMILARITY_THRESHOLD:
53
- return best_match, confidence, "fuzzy_matching"
54
-
55
- # Method 3: LLM correction (if available)
56
- if self.llm_available:
57
- llm_corrected = self._llm_correct(location)
58
- if llm_corrected:
59
- # Verify LLM suggestion with fuzzy matching
60
- llm_confidence = fuzz.ratio(location.lower(), llm_corrected.lower()) / 100
61
- if llm_confidence >= 0.6: # Lower threshold for LLM suggestions
62
- return llm_corrected, llm_confidence, "llm_correction"
63
-
64
- # Method 4: Partial matching
65
- partial_match = self._partial_match(location)
66
- if partial_match:
67
- return partial_match, 0.7, "partial_matching"
68
-
69
- # No correction found
70
- return location.title(), 0.0, "no_correction"
71
-
72
- def _fuzzy_match(self, location: str) -> Tuple[str, float]:
73
- """Find best fuzzy match for location"""
74
- best_match = None
75
- best_score = 0
76
-
77
- for available_location in self.available_locations:
78
- score = fuzz.ratio(location.lower(), available_location.lower()) / 100
79
- if score > best_score:
80
- best_score = score
81
- best_match = available_location
82
-
83
- return best_match, best_score
84
-
85
- def _partial_match(self, location: str) -> Optional[str]:
86
- """Find partial matches (substring matching)"""
87
- location_lower = location.lower()
88
-
89
- for available_location in self.available_locations:
90
- available_lower = available_location.lower()
91
-
92
- # Check if location is contained in available location
93
- if location_lower in available_lower or available_lower in location_lower:
94
- return available_location
95
-
96
- return None
97
-
98
- def _llm_correct(self, location: str) -> Optional[str]:
99
- """Use LLM to correct location name"""
100
- try:
101
- prompt = f"""
102
- You are a location name correction system for Sri Lankan cities and towns.
103
- Given a potentially misspelled location name, return the correct spelling.
104
-
105
- Available locations include: {', '.join(sorted(self.available_locations))}
106
-
107
- Input location: "{location}"
108
-
109
- Return only the corrected location name, nothing else. If no correction is possible, return "UNKNOWN".
110
- """
111
-
112
- corrected = None
113
- # Try new SDK first
114
- try:
115
- from openai import OpenAI
116
- client = OpenAI(api_key=self.config.OPENAI_API_KEY)
117
- response = client.chat.completions.create(
118
- model=self.config.OPENAI_MODEL,
119
- messages=[
120
- {"role": "system", "content": "You are a helpful assistant that corrects location names."},
121
- {"role": "user", "content": prompt}
122
- ],
123
- max_tokens=50,
124
- temperature=0.1
125
- )
126
- corrected = response.choices[0].message.content.strip()
127
- except Exception as sdk_err:
128
- # Fallback to legacy API if present
129
- import openai
130
- try:
131
- openai.api_key = self.config.OPENAI_API_KEY
132
- response = openai.ChatCompletion.create(
133
- model=self.config.OPENAI_MODEL,
134
- messages=[
135
- {"role": "system", "content": "You are a helpful assistant that corrects location names."},
136
- {"role": "user", "content": prompt}
137
- ],
138
- max_tokens=50,
139
- temperature=0.1
140
- )
141
- corrected = response.choices[0].message.content.strip()
142
- except Exception:
143
- raise sdk_err
144
-
145
- # Validate LLM response
146
- if corrected.upper() == "UNKNOWN":
147
- return None
148
-
149
- # Check if corrected location exists in our database
150
- if corrected in self.available_locations:
151
- return corrected
152
-
153
- # Try fuzzy matching on LLM response
154
- llm_fuzzy_match, confidence = self._fuzzy_match(corrected)
155
- if confidence >= 0.8:
156
- return llm_fuzzy_match
157
-
158
- return None
159
-
160
- except Exception as e:
161
- print(f"LLM correction error: {e}")
162
- return None
163
-
164
- def extract_locations_from_query(self, query: str) -> List[Tuple[str, str, float, str]]:
165
- """
166
- Extract and correct locations from a natural language query
167
-
168
- Returns:
169
- List of tuples: (original, corrected, confidence, method)
170
- """
171
- # Common patterns for location extraction
172
- patterns = [
173
- r'from\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)\s+to\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)(?:\s|$|\?)',
174
- r'([a-zA-Z]+(?:\s+[a-zA-Z]+)*)\s+to\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)(?:\s|$|\?)',
175
- r'between\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)\s+and\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)(?:\s|$|\?)',
176
- r'fare\s+(?:of|from\s+)?([a-zA-Z]+(?:\s+[a-zA-Z]+)*)\s+to\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)(?:\s|$|\?)',
177
- r'price\s+(?:of|from\s+)?([a-zA-Z]+(?:\s+[a-zA-Z]+)*)\s+to\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)(?:\s|$|\?)',
178
- r'cost\s+(?:of|from\s+)?([a-zA-Z]+(?:\s+[a-zA-Z]+)*)\s+to\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)(?:\s|$|\?)',
179
- r'how\s+much\s+(?:is|does)\s+(?:the\s+)?(?:fare|price|cost)\s+(?:from\s+)?([a-zA-Z]+(?:\s+[a-zA-Z]+)*)\s+to\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)(?:\s|$|\?)',
180
- r'what\s+(?:is|are)\s+(?:the\s+)?(?:fare|price|cost)s?\s+(?:from\s+)?([a-zA-Z]+(?:\s+[a-zA-Z]+)*)\s+to\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)(?:\s|$|\?)',
181
- r'([a-zA-Z]+(?:\s+[a-zA-Z]+)*)\s+to\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)\s+(?:fare|price|cost)(?:\s|$|\?)',
182
- r'(?:fare|price|cost)\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)\s+to\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)(?:\s|$|\?)'
183
- ]
184
-
185
- locations = []
186
-
187
- # Try all patterns to find locations
188
- for pattern in patterns:
189
- match = re.search(pattern, query, re.IGNORECASE)
190
- if match:
191
- # Extract locations from the match
192
- groups = match.groups()
193
- if len(groups) >= 2:
194
- from_location = groups[0].strip()
195
- to_location = groups[1].strip()
196
-
197
- # Skip if locations are too short or common words
198
- if len(from_location) >= 2 and from_location.lower() not in ['to', 'from', 'and', 'the', 'a', 'an']:
199
- from_corrected, from_confidence, from_method = self.correct_location(from_location)
200
- if from_confidence > 0.5:
201
- locations.append((
202
- from_location,
203
- from_corrected,
204
- from_confidence,
205
- from_method
206
- ))
207
-
208
- if len(to_location) >= 2 and to_location.lower() not in ['to', 'from', 'and', 'the', 'a', 'an']:
209
- to_corrected, to_confidence, to_method = self.correct_location(to_location)
210
- if to_confidence > 0.5:
211
- locations.append((
212
- to_location,
213
- to_corrected,
214
- to_confidence,
215
- to_method
216
- ))
217
-
218
- # If we found locations, break to avoid duplicates
219
- if len(locations) >= 2:
220
- break
221
-
222
- return locations
223
-
224
- def get_suggestions(self, partial_location: str) -> List[Tuple[str, float]]:
225
- """Get location suggestions for autocomplete"""
226
- suggestions = []
227
- partial_lower = partial_location.lower()
228
-
229
- for location in self.available_locations:
230
- location_lower = location.lower()
231
-
232
- # Check if partial location is a prefix
233
- if location_lower.startswith(partial_lower):
234
- suggestions.append((location, 1.0))
235
- # Check fuzzy similarity
236
- elif fuzz.ratio(partial_lower, location_lower) / 100 >= 0.6:
237
- suggestions.append((location, fuzz.ratio(partial_lower, location_lower) / 100))
238
-
239
- # Sort by confidence and return top suggestions
240
- suggestions.sort(key=lambda x: x[1], reverse=True)
241
- return suggestions[:self.config.MAX_SUGGESTIONS]
242
-
243
- def validate_route(self, from_location: str, to_location: str) -> Tuple[bool, str]:
244
- """Validate if a route exists in the database"""
245
- from_corrected, from_confidence, _ = self.correct_location(from_location)
246
- to_corrected, to_confidence, _ = self.correct_location(to_location)
247
-
248
- if from_confidence < 0.5:
249
- return False, f"Could not identify departure location: '{from_location}'"
250
-
251
- if to_confidence < 0.5:
252
- return False, f"Could not identify destination location: '{to_location}'"
253
-
254
- if from_corrected == to_corrected:
255
- return False, f"Departure and destination cannot be the same: '{from_corrected}'"
256
-
257
- return True, f"Route: {from_corrected} → {to_corrected}"
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Spell Correction Module for Transport Query Application
4
+ Handles location name corrections using fuzzy matching and LLM
5
+ """
6
+
7
+ import re
8
+ from fuzzywuzzy import fuzz
9
+ from typing import List, Tuple, Optional
10
+ import openai
11
+ from config import Config
12
+
13
class SpellCorrector:
    """Spell correction for location names.

    Corrections are attempted in order: direct lookup in the configured
    location mapping, fuzzy matching against the known locations, an optional
    LLM suggestion, and finally substring matching.
    """

    # Words the extraction regexes can capture but that are never treated as
    # place names.
    _STOP_WORDS = frozenset({'to', 'from', 'and', 'the', 'a', 'an'})

    # Location-extraction patterns, tried in this order. Compiled once at
    # class creation instead of on every query.
    _LOCATION_PATTERNS = tuple(
        re.compile(pattern, re.IGNORECASE)
        for pattern in (
            r'from\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)\s+to\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)(?:\s|$|\?)',
            r'([a-zA-Z]+(?:\s+[a-zA-Z]+)*)\s+to\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)(?:\s|$|\?)',
            r'between\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)\s+and\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)(?:\s|$|\?)',
            r'fare\s+(?:of|from\s+)?([a-zA-Z]+(?:\s+[a-zA-Z]+)*)\s+to\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)(?:\s|$|\?)',
            r'price\s+(?:of|from\s+)?([a-zA-Z]+(?:\s+[a-zA-Z]+)*)\s+to\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)(?:\s|$|\?)',
            r'cost\s+(?:of|from\s+)?([a-zA-Z]+(?:\s+[a-zA-Z]+)*)\s+to\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)(?:\s|$|\?)',
            r'how\s+much\s+(?:is|does)\s+(?:the\s+)?(?:fare|price|cost)\s+(?:from\s+)?([a-zA-Z]+(?:\s+[a-zA-Z]+)*)\s+to\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)(?:\s|$|\?)',
            r'what\s+(?:is|are)\s+(?:the\s+)?(?:fare|price|cost)s?\s+(?:from\s+)?([a-zA-Z]+(?:\s+[a-zA-Z]+)*)\s+to\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)(?:\s|$|\?)',
            r'([a-zA-Z]+(?:\s+[a-zA-Z]+)*)\s+to\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)\s+(?:fare|price|cost)(?:\s|$|\?)',
            r'(?:fare|price|cost)\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)\s+to\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)(?:\s|$|\?)',
        )
    )

    def __init__(self):
        self.config = Config()
        self.location_mapping = self.config.LOCATION_MAPPING
        self.available_locations = set(self.location_mapping.values())

        # Initialize OpenAI if API key is available
        self.llm_available = False
        if self.config.OPENAI_API_KEY:
            try:
                # Prefer new SDK client if installed; otherwise set legacy api key
                try:
                    from openai import OpenAI  # noqa: F401
                except Exception:
                    openai.api_key = self.config.OPENAI_API_KEY
                self.llm_available = True
            except Exception:
                self.llm_available = False

    def correct_location(self, location: str) -> Tuple[str, float, str]:
        """
        Correct a location name using multiple methods

        Blank input is never matched: it is returned as
        ``("", 0.0, "no_correction")``.

        Returns:
            Tuple of (corrected_name, confidence_score, correction_method)
        """
        location = location.strip().lower()

        # An empty string is a substring of every name, so letting it reach
        # partial matching would "correct" it to an arbitrary location.
        if not location:
            return "", 0.0, "no_correction"

        # Method 1: Direct mapping
        if location in self.location_mapping:
            corrected = self.location_mapping[location]
            return corrected, 1.0, "direct_mapping"

        # Method 2: Fuzzy matching
        best_match, confidence = self._fuzzy_match(location)
        if best_match is not None and confidence >= self.config.SIMILARITY_THRESHOLD:
            return best_match, confidence, "fuzzy_matching"

        # Method 3: LLM correction (if available)
        if self.llm_available:
            llm_corrected = self._llm_correct(location)
            if llm_corrected:
                # Verify LLM suggestion with fuzzy matching
                llm_confidence = fuzz.ratio(location.lower(), llm_corrected.lower()) / 100
                if llm_confidence >= 0.6:  # Lower threshold for LLM suggestions
                    return llm_corrected, llm_confidence, "llm_correction"

        # Method 4: Partial matching
        partial_match = self._partial_match(location)
        if partial_match:
            return partial_match, 0.7, "partial_matching"

        # No correction found
        return location.title(), 0.0, "no_correction"

    def _fuzzy_match(self, location: str) -> Tuple[Optional[str], float]:
        """Find best fuzzy match for location.

        Returns:
            ``(best_location, score)`` with ``score`` in ``[0, 1]``, or
            ``(None, 0)`` when no known location scores above zero.
        """
        best_match = None
        best_score = 0
        target = location.lower()

        for available_location in self.available_locations:
            score = fuzz.ratio(target, available_location.lower()) / 100
            if score > best_score:
                best_score = score
                best_match = available_location

        return best_match, best_score

    def _partial_match(self, location: str) -> Optional[str]:
        """Find partial matches (substring matching).

        Known locations are scanned in sorted order so the result is
        deterministic when several of them match.
        """
        location_lower = location.strip().lower()
        if not location_lower:
            return None

        for available_location in sorted(self.available_locations):
            available_lower = available_location.lower()

            # Match in either direction: input inside a known name, or a
            # known name inside the input.
            if location_lower in available_lower or available_lower in location_lower:
                return available_location

        return None

    def _ask_llm(self, prompt: str) -> str:
        """Send ``prompt`` to the chat model and return the stripped reply text.

        Tries the new OpenAI SDK client first and falls back to the legacy
        ``openai.ChatCompletion`` API; if both fail, the error from the new
        SDK attempt is raised.
        """
        messages = [
            {"role": "system", "content": "You are a helpful assistant that corrects location names."},
            {"role": "user", "content": prompt}
        ]
        try:
            from openai import OpenAI
            client = OpenAI(api_key=self.config.OPENAI_API_KEY)
            response = client.chat.completions.create(
                model=self.config.OPENAI_MODEL,
                messages=messages,
                max_tokens=50,
                temperature=0.1
            )
        except Exception as sdk_err:
            # Fallback to legacy API if present
            import openai
            try:
                openai.api_key = self.config.OPENAI_API_KEY
                response = openai.ChatCompletion.create(
                    model=self.config.OPENAI_MODEL,
                    messages=messages,
                    max_tokens=50,
                    temperature=0.1
                )
            except Exception:
                raise sdk_err

        content = response.choices[0].message.content
        # The reply may be missing or wrapped in quotes; normalise both.
        return (content or "").strip().strip('"\'').strip()

    def _llm_correct(self, location: str) -> Optional[str]:
        """Use LLM to correct location name.

        Returns:
            A name from the known locations, or ``None`` when the model
            answers ``UNKNOWN``, answers with something that cannot be mapped
            to a known location, or the request fails.
        """
        try:
            prompt = f"""
            You are a location name correction system for Sri Lankan cities and towns.
            Given a potentially misspelled location name, return the correct spelling.

            Available locations include: {', '.join(sorted(self.available_locations))}

            Input location: "{location}"

            Return only the corrected location name, nothing else. If no correction is possible, return "UNKNOWN".
            """

            corrected = self._ask_llm(prompt)

            # Validate LLM response
            if not corrected or corrected.upper() == "UNKNOWN":
                return None

            # Check if corrected location exists in our database
            if corrected in self.available_locations:
                return corrected

            # Try fuzzy matching on LLM response
            llm_fuzzy_match, confidence = self._fuzzy_match(corrected)
            if llm_fuzzy_match is not None and confidence >= 0.8:
                return llm_fuzzy_match

            return None

        except Exception as e:
            print(f"LLM correction error: {e}")
            return None

    def extract_locations_from_query(self, query: str) -> List[Tuple[str, str, float, str]]:
        """
        Extract and correct locations from a natural language query

        Patterns are tried in order and the first one that yields both a
        departure and a destination wins. If no pattern yields both, the
        result of the pattern that produced the most locations is returned
        (possibly empty). Results from different patterns are never mixed,
        so the list never contains duplicates or mis-ordered entries.

        Returns:
            List of tuples: (original, corrected, confidence, method)
        """
        # Corrections can involve an LLM round trip; remember them so the
        # same candidate captured by several patterns is only corrected once.
        corrections = {}
        best_partial: List[Tuple[str, str, float, str]] = []

        for pattern in self._LOCATION_PATTERNS:
            match = pattern.search(query)
            if not match:
                continue

            found = []
            for raw in match.groups()[:2]:
                candidate = raw.strip()

                # Skip if locations are too short or common words
                if len(candidate) < 2 or candidate.lower() in self._STOP_WORDS:
                    continue

                if candidate not in corrections:
                    corrections[candidate] = self.correct_location(candidate)
                corrected, confidence, method = corrections[candidate]

                if confidence > 0.5:
                    found.append((candidate, corrected, confidence, method))

            if len(found) >= 2:
                return found
            if len(found) > len(best_partial):
                best_partial = found

        return best_partial

    def get_suggestions(self, partial_location: str) -> List[Tuple[str, float]]:
        """Get location suggestions for autocomplete.

        Returns:
            Up to ``MAX_SUGGESTIONS`` ``(location, score)`` pairs, best score
            first; ties are ordered by name. Prefix matches score 1.0,
            otherwise the fuzzy ratio is used when it is at least 0.6.
        """
        suggestions = []
        partial_lower = partial_location.lower()

        for location in self.available_locations:
            location_lower = location.lower()

            # Check if partial location is a prefix
            if location_lower.startswith(partial_lower):
                suggestions.append((location, 1.0))
                continue

            # Check fuzzy similarity
            score = fuzz.ratio(partial_lower, location_lower) / 100
            if score >= 0.6:
                suggestions.append((location, score))

        # Sort by confidence (then name, for a stable order) and return top suggestions
        suggestions.sort(key=lambda item: (-item[1], item[0]))
        return suggestions[:self.config.MAX_SUGGESTIONS]

    def validate_route(self, from_location: str, to_location: str) -> Tuple[bool, str]:
        """Validate that both ends of a route can be identified and differ.

        This only checks the location names via ``correct_location``; it does
        not query any database for the route itself.

        Returns:
            ``(is_valid, message)`` where ``message`` describes the route or
            the reason it was rejected.
        """
        from_corrected, from_confidence, _ = self.correct_location(from_location)
        to_corrected, to_confidence, _ = self.correct_location(to_location)

        if from_confidence < 0.5:
            return False, f"Could not identify departure location: '{from_location}'"

        if to_confidence < 0.5:
            return False, f"Could not identify destination location: '{to_location}'"

        if from_corrected == to_corrected:
            return False, f"Departure and destination cannot be the same: '{from_corrected}'"

        return True, f"Route: {from_corrected} → {to_corrected}"
translation_service.py CHANGED
@@ -1,702 +1,1057 @@
1
- #!/usr/bin/env python3
2
- """
3
- Translation Service for Sinhala-English Translation
4
- Handles translation of queries and responses with multiple free alternatives
5
- """
6
-
7
- import requests
8
- import json
9
- import re
10
- import openai
11
- from typing import Dict, Any, Optional
12
- from config import Config
13
- from logger import get_logger
14
-
15
- class TranslationService:
16
def __init__(self):
    """Set up configuration, logging, endpoints and the Sinhala term table.

    Reads ``OPENAI_API_KEY`` from ``Config`` (``None`` when absent) and two
    environment switches, ``USE_PATTERN_TRANSLATION`` and
    ``FORCE_LLM_TRANSLATION``, each enabled only by the value ``true``
    (case-insensitive).
    """
    self.config = Config()
    self.openai_api_key = getattr(self.config, 'OPENAI_API_KEY', None)
    self.logger = get_logger(self.__class__.__name__)
    # Controls
    import os
    self.use_pattern_translation = os.getenv('USE_PATTERN_TRANSLATION', 'false').lower() == 'true'
    self.force_llm_translation = os.getenv('FORCE_LLM_TRANSLATION', 'false').lower() == 'true'
    # Name of the backend that produced the most recent translation.
    self.last_translation_method: Optional[str] = None

    # Free translation APIs
    self.libre_translate_url = "https://libretranslate.de/translate"  # Free public instance
    self.mymemory_url = "https://api.mymemory.translated.net/get"

    # Common transport terms in Sinhala and their English equivalents
    self.transport_terms = {
        # Fare related
        'කීයද': 'how much',
        'මිල': 'price',
        'වාරික': 'fare',
        'වාරිකය': 'fare',
        'වාරිකව': 'fare',
        'ගාස්තු': 'fare',
        'ගාස්තුව': 'fare',
        'ප්‍රවාහන ගාස්තු': 'transport fare',
        'බස් ගාස්තු': 'bus fare',
        'බස් ගාස්තුව': 'bus fare',
        'රේල් ගාස්තු': 'train fare',
        'රේල් ගාස්තුව': 'train fare',

        # Locations
        'කොළඹ': 'Colombo',
        'මහනුවර': 'Kandy',
        'මහනුවරට': 'Kandy',
        'ගාල්ල': 'Galle',
        'ගාල්ලට': 'Galle',
        'මාතර': 'Matara',
        'මාතරට': 'Matara',
        'අනුරාධපුර': 'Anuradhapura',
        'අනුරාධපුරට': 'Anuradhapura',
        'පානදුර': 'Panadura',
        'පානදුරට': 'Panadura',
        'අලුත්ගම': 'Aluthgama',
        'අලුත්ගමට': 'Aluthgama',
        'නුගේගොඩ': 'Nugegoda',
        'නුගේගොඩට': 'Nugegoda',
        'දෙහිවල': 'Dehiwala',
        'දෙහිවලට': 'Dehiwala',
        'මොරටුව': 'Moratuwa',
        'මොරටුවට': 'Moratuwa',

        # Direction words
        'වලින්': 'from',
        'වල': 'from',
        # The dative particle must be spelled out: an empty key would make
        # str.replace insert "to" between every character.
        'ට': 'to',
        'වෙත': 'to',
        'සිට': 'from',
        'දක්වා': 'to',
        'සි': 'from',

        # Question words
        'කොහෙද': 'where',
        'කවදාද': 'when',
        'කොහොමද': 'how',
        'මොනවාද': 'what',
        'කවුද': 'who',

        # Comparison words
        'සමඟ': 'with',
        'සහ': 'and',
        'හෝ': 'or',
        'වඩා': 'more',
        'අඩු': 'less',
        'සමාන': 'same',
        'වෙනස': 'different',
        'සසඳන්න': 'compare',
        'සසඳන': 'compare',

        # Time words
        'දැන්': 'now',
        'අද': 'today',
        'හෙට': 'tomorrow',
        'ඊයේ': 'yesterday',

        # Common verbs
        'යන්න': 'go',
        'යන': 'go',
        'එන්න': 'come',
        'බලන්න': 'see',
        'දැනගන්න': 'know',
        'සොයන්න': 'find',
        'සොයන': 'find',
        'ඉගෙනගන්න': 'learn',
        'නිර්දේශ': 'recommend',
        'නිර්දේශ කරන්න': 'recommend',
        'පෙන්වන්න': 'show',
        'පෙන්වන': 'show',

        # Numbers and currency
        'රුපියල්': 'rupees',
        'රු': 'rupees',
        'රුපියල': 'rupees',

        # Common phrases
        'අතර': 'between',
        'සහිත': 'with',
        'මාර්ග': 'routes',
        'මාර්ගවල': 'routes',
        'ගමනාන්ත': 'destinations',
        'ප්‍රසිද්ධ': 'popular',
        'සාමාන්‍ය': 'average',
        'සාමාන්‍යය': 'average',
        'දත්ත': 'data',
        'සංඛ්‍යාලේඛන': 'statistics',
    }

    # Sinhala script detection pattern (Unicode block U+0D80-U+0DFF)
    self.sinhala_pattern = re.compile(r'[\u0D80-\u0DFF]')
134
-
135
def is_sinhala_text(self, text: str) -> bool:
    """Return True when *text* contains at least one Sinhala-script character.

    The outcome is also written to the debug log together with the text.
    """
    has_sinhala = self.sinhala_pattern.search(text) is not None
    self.logger.debug(f"Sinhala detection: detected={has_sinhala}, text='{text}'")
    return has_sinhala
140
-
141
- def _map_sinhala_place(self, text: str) -> str:
142
- """Map a Sinhala place token to its English equivalent using known terms and suffix stripping."""
143
- candidate = text.strip()
144
- # Direct map
145
- if candidate in self.transport_terms:
146
- return self.transport_terms[candidate]
147
- # Strip common Sinhala case particles/suffixes and try again
148
- base = re.sub(r'(ට|වෙත|දක්වා|වලින්|වල|සිට)$', '', candidate)
149
- if base in self.transport_terms:
150
- return self.transport_terms[base]
151
- return candidate
152
-
153
- def _parse_sinhala_fare_query(self, query: str) -> Optional[str]:
154
- """Detect simple Sinhala fare queries and build a clean English query.
155
- Example handled: "කොළඹ සිට මහනුවරට ගාස්තුව කීයද?" -> "What is the fare from Colombo to Kandy?"
156
- """
157
- try:
158
- # Quick check for fare-related tokens to avoid false positives
159
- if not any(tok in query for tok in ['ගාස්තු', 'ගාස්තුව', 'වාරික', 'වාරිකය', 'මිල']):
160
- return None
161
- # Extract source and destination around Sinhala "from" and "to" particles
162
- m = re.search(r'([\u0D80-\u0DFF\s]+?)\s*සිට\s*([\u0D80-\u0DFF\s]+?)(?:ට|වෙත|දක්වා)', query)
163
- if not m:
164
- return None
165
- src_si = m.group(1).strip()
166
- dst_si = m.group(2).strip()
167
- src_en = self._map_sinhala_place(src_si)
168
- dst_en = self._map_sinhala_place(dst_si)
169
- return f"What is the fare from {src_en} to {dst_en}?"
170
- except Exception:
171
- return None
172
-
173
def translate_with_llm(self, text: str, target_lang: str, source_lang: str = 'auto') -> Optional[str]:
    """Translate between Sinhala and English with an OpenAI chat model.

    The request carries few-shot examples so that comparison, imperative
    and list structure survive translation. Prompt and examples follow the
    requested direction, so an English-to-Sinhala request is no longer
    primed to answer in English.

    Args:
        text: Text to translate.
        target_lang: ``'en'`` or ``'si'``.
        source_lang: ``'si'``, ``'en'`` or ``'auto'`` (Sinhala when the text
            contains Sinhala script, English otherwise).

    Returns:
        The translated text, or ``None`` when no API key is configured, the
        language pair is unsupported, the reply is empty, or the API call
        fails (the failure is logged as a warning).
    """
    if not self.openai_api_key:
        return None

    try:
        # Determine source language
        if source_lang == 'auto':
            source_lang = 'si' if self.is_sinhala_text(text) else 'en'

        language_names = {'si': 'Sinhala', 'en': 'English'}
        if (source_lang, target_lang) not in {('si', 'en'), ('en', 'si')}:
            return None
        source_name = language_names[source_lang]
        target_name = language_names[target_lang]
        to_english = target_lang == 'en'

        # Few-shot (Sinhala, English) pairs that preserve
        # comparison/imperative structure; flipped for English -> Sinhala.
        examples = [
            (
                "කොළඹ සිට මහනුවරට යන බස් ගාස්තුව කීයද?",
                "What is the bus fare from Colombo to Kandy?"
            ),
            (
                "කොළඹ සිට ගාල්ල දක්වා ටිකට් මිල කීයද?",
                "What is the ticket price from Colombo to Galle?"
            ),
            (
                "කොළඹ සිට පානදුර දක්වා සහ කොළඹ සිට ගාල්ල දක්වා ගාස්තු සසඳා බලන්න.",
                "Compare fares from Colombo to Panadura and from Colombo to Galle."
            ),
            (
                "රුපියල් 500 අඩු ගාස්තු සහිත මාර්ග පෙන්වන්න.",
                "Show routes with fares under 500 rupees."
            ),
            (
                "අඩු මිලේ මාර්ග නිර්දේශ කරන්න.",
                "Recommend cheap routes."
            ),
        ]

        if to_english:
            system_prompt = (
                "You are a professional translator. Translate accurately and naturally. "
                "Preserve imperative/comparative intent and list structure. Do not paraphrase. "
                "Return only the English translation without quotes. "
                "Canonical phrasing rules (use exactly): \n"
                "- Use 'Compare' for comparison requests.\n"
                "- Use 'Show' for requests like 'පෙන්වන්න' (do not use Provide/List).\n"
                "- Use 'How much is the' for 'කීයද' fare/price questions.\n"
                "- Use 'cheap' (not 'affordable').\n"
                "- Use 'under' (not 'below') for '< value'.\n"
            )
            instructions = (
                "Instructions: Preserve structure. Use 'Compare' for 'සසඳ', use 'from' for 'සිට' and 'to' for 'ට/වෙත/දක්වා'.\n"
                "Use exact place names: මහනුවර=Kandy, කොළඹ=Colombo, ගාල්ල=Galle, මාතර=Matara, අනුරාධපුර=Anuradhapura."
            )
        else:
            system_prompt = (
                "You are a professional translator. Translate accurately and naturally. "
                "Preserve imperative/comparative intent and list structure. Do not paraphrase. "
                "Return only the Sinhala translation without quotes."
            )
            instructions = (
                "Instructions: Preserve structure. Use 'සිට' for 'from' and 'ට/වෙත/දක්වා' for 'to'.\n"
                "Use exact place names: Kandy=මහනුවර, Colombo=කොළඹ, Galle=ගාල්ල, Matara=මාතර, Anuradhapura=අනුරාධපුර."
            )

        # Compose messages with few-shot conditioning
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": instructions},
        ]
        for si, en in examples:
            shot_source, shot_target = (si, en) if to_english else (en, si)
            messages.append({"role": "user", "content": f"{source_name}: {shot_source}\n{target_name}:"})
            messages.append({"role": "assistant", "content": shot_target})
        messages.append({"role": "user", "content": f"{source_name}: {text}\n{target_name}:"})

        request = {
            "model": "gpt-3.5-turbo",
            "max_tokens": 150,
            "temperature": 0.3,
            "messages": messages,
        }

        # Use new OpenAI SDK
        try:
            from openai import OpenAI
            client = OpenAI(api_key=self.openai_api_key)
            response = client.chat.completions.create(**request)
            translated = response.choices[0].message.content.strip()
            self.last_translation_method = 'llm'
        except Exception as sdk_err:
            # Fallback to legacy API if available
            import openai
            try:
                openai.api_key = self.openai_api_key
                response = openai.ChatCompletion.create(**request)
                translated = response.choices[0].message.content.strip()
                self.last_translation_method = 'llm'
            except Exception:
                # Report the error from the primary SDK path.
                raise sdk_err

        # Models sometimes wrap the answer in double quotes despite the prompt.
        if translated.startswith('"') and translated.endswith('"'):
            translated = translated[1:-1]
        return translated if translated else None
    except Exception as e:
        self.logger.warning(f"LLM translation error: {e}")
        return None
299
-
300
def translate_with_libre_translate(self, text: str, target_lang: str, source_lang: str = 'auto') -> Optional[str]:
    """Translate *text* through the LibreTranslate HTTP endpoint.

    Posts a JSON request to ``self.libre_translate_url`` with a 10 second
    timeout. Unknown source codes fall back to ``'auto'`` and unknown
    target codes to ``'en'``.

    Returns:
        The ``translatedText`` field of a 200 response, otherwise ``None``
        (non-200 status, or any error, which is logged as a warning).
    """
    try:
        # Language codes the service is asked for
        supported = {
            'si': 'si',  # Sinhala
            'en': 'en',  # English
            'auto': 'auto'
        }
        request_body = {
            'q': text,
            'source': supported.get(source_lang, 'auto'),
            'target': supported.get(target_lang, 'en'),
            'format': 'text'
        }

        reply = requests.post(
            self.libre_translate_url,
            json=request_body,
            headers={'Content-Type': 'application/json'},
            timeout=10
        )
        if reply.status_code != 200:
            return None

        translated = reply.json().get('translatedText')
        self.logger.debug(f"LibreTranslate success: '{text}' -> '{translated}'")
        self.last_translation_method = 'libretranslate'
        return translated

    except Exception as e:
        self.logger.warning(f"LibreTranslate error: {e}")
        return None
343
-
344
def translate_with_mymemory(self, text: str, target_lang: str, source_lang: str = 'auto') -> Optional[str]:
    """Translate *text* through the MyMemory HTTP endpoint.

    Sends a GET request to ``self.mymemory_url`` with a 10 second timeout
    and a ``langpair`` of ``"<source>|<target_lang>"``; unknown source
    codes fall back to ``'auto'``.

    Returns:
        ``responseData.translatedText`` of a 200 response, otherwise
        ``None`` (non-200 status, or any error, which is logged as a
        warning).
    """
    try:
        # Language codes the service is asked for
        supported = {
            'si': 'si',  # Sinhala
            'en': 'en',  # English
            'auto': 'auto'
        }
        query_params = {
            'q': text,
            'langpair': f"{supported.get(source_lang, 'auto')}|{target_lang}"
        }

        reply = requests.get(self.mymemory_url, params=query_params, timeout=10)
        if reply.status_code != 200:
            return None

        translated = reply.json().get('responseData', {}).get('translatedText')
        self.logger.debug(f"MyMemory success: '{text}' -> '{translated}'")
        self.last_translation_method = 'mymemory'
        return translated

    except Exception as e:
        self.logger.warning(f"MyMemory translation error: {e}")
        return None
380
-
381
-
382
-
383
def translate_with_dictionary(self, text: str, target_lang: str) -> str:
    """Translate by substituting known terms from ``self.transport_terms``.

    Args:
        text: Text to translate.
        target_lang: ``'en'`` (Sinhala to English) or ``'si'`` (English to
            Sinhala). Any other value returns *text* unchanged.

    Returns:
        *text* with every known term replaced. Longer terms are applied
        before shorter ones so that a short term cannot clobber part of a
        longer one (e.g. 'ගාස්තු' inside 'ගාස්තුව').
    """
    if target_lang == 'en':
        # Sinhala to English
        pairs = list(self.transport_terms.items())
    elif target_lang == 'si':
        # English to Sinhala
        pairs = [(english, sinhala) for sinhala, english in self.transport_terms.items()]
    else:
        return text

    # The sort is stable, so among equally long terms the table order
    # (and therefore which synonym wins) is unchanged.
    pairs.sort(key=lambda pair: len(pair[0]), reverse=True)

    translated = text
    for source_term, target_term in pairs:
        # str.replace('', x) inserts x between every character; skip blanks.
        if source_term:
            translated = translated.replace(source_term, target_term)
    return translated
399
-
400
def translate_text(self, text: str, target_lang: str, source_lang: str = 'auto') -> str:
    """Translate *text*, trying each backend in turn until one succeeds.

    Order: LLM, MyMemory, LibreTranslate, dictionary. When
    ``self.force_llm_translation`` is set only the LLM is tried before the
    final dictionary fallback.

    Args:
        text: Text to translate. Empty or whitespace-only text is returned
            unchanged and the method tracker is left untouched.
        target_lang: Target language code.
        source_lang: Source language code, or ``'auto'``.

    Returns:
        The translated text. ``self.last_translation_method`` is set to the
        lower-cased name of the backend that produced it.
    """
    if not text or not text.strip():
        return text

    # Forget the previous call's backend so a stale value is never reported.
    self.last_translation_method = None

    translation_methods = [
        ('LLM', lambda: self.translate_with_llm(text, target_lang, source_lang)),
    ]
    if not self.force_llm_translation:
        translation_methods += [
            ('MyMemory', lambda: self.translate_with_mymemory(text, target_lang, source_lang)),
            ('LibreTranslate', lambda: self.translate_with_libre_translate(text, target_lang, source_lang)),
            ('Dictionary', lambda: self.translate_with_dictionary(text, target_lang)),
        ]

    for method_name, method_func in translation_methods:
        try:
            result = method_func()
        except Exception as e:
            self.logger.warning(f"{method_name} translation failed: {e}")
            continue
        if result and result.strip():
            self.logger.info(f"Translation successful using {method_name}")
            # Record the backend that actually delivered. A backend that
            # answered with an empty result may already have written its
            # own name into the tracker.
            self.last_translation_method = method_name.lower()
            return result.strip()

    # Final fallback
    result = self.translate_with_dictionary(text, target_lang)
    self.last_translation_method = 'dictionary'
    return result
434
-
435
def translate_query(self, query: str) -> Dict[str, Any]:
    """Convert a user query to English when it contains Sinhala script.

    Returns:
        A dict with ``is_sinhala``, ``original_query``, ``translated_query``
        and ``translation_method``. Non-Sinhala queries pass through with
        method ``'none'``. When ``self.use_pattern_translation`` is on and
        the fare-pattern parser recognises the query, the method is
        ``'pattern'``. Otherwise the query goes through ``translate_text``
        and ``_normalize_english_query``.
    """
    def build(is_sinhala: bool, translated_query: str, method: str) -> Dict[str, Any]:
        return {
            'is_sinhala': is_sinhala,
            'original_query': query,
            'translated_query': translated_query,
            'translation_method': method
        }

    if not self.is_sinhala_text(query):
        return build(False, query, 'none')

    # Optional: Sinhala-specific fare parsing (off unless USE_PATTERN_TRANSLATION=true)
    if self.use_pattern_translation:
        parsed = self._parse_sinhala_fare_query(query)
        if parsed:
            self.logger.info(f"Pattern-based Sinhala fare parse: '{query}' -> '{parsed}'")
            return build(True, parsed, 'pattern')

    # General translation to English, then vocabulary normalisation for the NLP stage
    english = self._normalize_english_query(self.translate_text(query, 'en', 'si'))
    method = self.last_translation_method or ('llm' if self.openai_api_key else 'dictionary')
    self.logger.info(f"Translated Sinhala query ({method}): '{query}' -> '{english}'")
    return build(True, english, method)
470
-
471
- def _normalize_english_query(self, text: str) -> str:
472
- """Normalize English synonyms to match NLP patterns (fare/price/cost)."""
473
- if not text:
474
- return text
475
- normalized = text
476
- replacements = {
477
- 'fees': 'fare',
478
- 'fee': 'fare',
479
- 'charges': 'cost',
480
- 'charge': 'cost',
481
- 'ticket price': 'fare',
482
- 'ticket fare': 'fare',
483
- 'bus ticket': 'bus fare',
484
- }
485
- # Lowercase operate, then restore original casing minimally by returning lowercase; downstream lowercases anyway
486
- lower = normalized.lower()
487
- for old, new in replacements.items():
488
- lower = lower.replace(old, new)
489
- return lower
490
-
491
def translate_response(self, response: Dict[str, Any]) -> Dict[str, Any]:
    """Return a copy of *response* with its user-facing text in Sinhala.

    Translates ``message``, every entry of ``suggestions``, and the
    ``original``/``corrected`` fields of every entry in ``corrections``
    (when present and non-empty) through ``translate_text``. The input
    dict and its correction dicts are not modified.

    Returns:
        The translated copy with an added ``translation_info`` entry whose
        ``translation_method`` is the backend recorded in
        ``self.last_translation_method``, falling back to the configured
        default when none is recorded (same rule as ``translate_query``).
    """
    translated_response = response.copy()

    def to_sinhala(value: str) -> str:
        return self.translate_text(value, 'si', 'en')

    # Translate the main message
    if 'message' in response:
        translated_response['message'] = to_sinhala(response['message'])

    # Translate suggestions if any
    if response.get('suggestions'):
        translated_response['suggestions'] = [
            to_sinhala(suggestion) for suggestion in response['suggestions']
        ]

    # Translate corrections if any
    if response.get('corrections'):
        translated_corrections = []
        for correction in response['corrections']:
            translated_correction = correction.copy()
            for field in ('original', 'corrected'):
                if field in correction:
                    translated_correction[field] = to_sinhala(correction[field])
            translated_corrections.append(translated_correction)
        translated_response['corrections'] = translated_corrections

    # Add translation metadata. Report the backend that was really used
    # rather than assuming the LLM whenever an API key is configured.
    translated_response['translation_info'] = {
        'translated': True,
        'translation_method': self.last_translation_method
        or ('llm' if self.openai_api_key else 'dictionary')
    }

    return translated_response
531
-
532
def get_sinhala_examples(self) -> Dict[str, Any]:
    """Return sample Sinhala queries grouped by query category.

    Returns:
        A dict mapping each category name to a list of
        ``{'query': ..., 'description': ...}`` dicts.
    """
    # (query, description) pairs per category, expanded into dicts below.
    catalogue = {
        'fare_queries': [
            ('කොළඹ සිට මහනුවරට යන බස් ගාස්තුව කීයද?', 'කොළඹ සිට මහනුවරට යන බස් ගාස්තුව සොයන්න'),
            ('මාතර සිට ගාල්ලට යන මිල කීයද?', 'මාතර සිට ගාල්ලට යන මිල සොයන්න'),
            ('අනුරාධපුර සිට කොළඹට යන වාරිකය', 'අනුරාධපුර සිට කොළඹට යන වාරිකය සොයන්න'),
        ],
        'comparison_queries': [
            ('කොළඹ සිට මහනුවරට සහ කොළඹ සිට ගාල්ලට යන ගාස්තු සසඳන්න', 'විවිධ මාර්ගවල ගාස්තු සසඳන්න'),
            ('කොළඹ සිට මහනුවරට සහ කොළඹ සිට අනුරාධපුරට යන ගාස්තුවල වෙනස කීයද?', 'මාර්ග දෙකක ගාස්තු වෙනස සොයන්න'),
        ],
        'range_queries': [
            ('රුපියල් 500 අඩු ගාස්තු සහිත මාර්ග සොයන්න', 'රුපියල් 500 ට අඩු ගාස්තු සහිත මාර්ග සොයන්න'),
            ('රුපියල් 200 සහ 800 අතර ගාස්තු සහිත මාර්ග පෙන්වන්න', 'රුපියල් 200 සහ 800 අතර ගාස්තු සහිත මාර්ග සොයන්න'),
        ],
        'recommendation_queries': [
            ('අඩු මිලේ මාර්ග නිර්දේශ කරන්න', 'අඩු මිලේ මාර්ග නිර්දේශ කරන්න'),
            ('ප්‍රසිද්ධ ගමනාන්ත පෙන්වන්න', 'ප්‍රසිද්ධ ගමනාන්ත සොයන්න'),
        ],
        'statistical_queries': [
            ('සාමාන්‍ය ගාස්තුව කීයද?', 'සාමාන්‍ය ගාස්තුව සොයන්න'),
            ('දත්ත ගබඩා සංඛ්‍යාලේඛන', 'දත්ත ගබඩා සංඛ්‍යාලේඛන සොයන්න'),
        ],
    }
    return {
        category: [{'query': query, 'description': description} for query, description in entries]
        for category, entries in catalogue.items()
    }
592
-
593
def test_translation(self) -> Dict[str, Any]:
    """Run a built-in Sinhala-to-English self-test and report the results.

    Each sample is translated with ``translate_text`` and graded:
    ``'exact'`` for a case-insensitive match with the expected English,
    ``'good'`` when one non-empty string contains the other, ``'partial'``
    otherwise. Comparison queries (containing 'සසඳ') must additionally
    yield a translation containing "compare" to count as intent-preserving.

    Returns:
        A dict with ``translation_service_status``, ``available_methods``,
        a ``summary`` of counts and accuracy percentage, and the
        per-sample ``test_results``.
    """
    test_cases = [
        {
            'sinhala': 'කොළඹ සිට මහනුවරට යන බස් ගාස්තුව කීයද?',
            'expected_english': 'What is the bus fare from Colombo to Kandy?'
        },
        {
            'sinhala': 'මාතර සිට ගාල්ලට යන මිල කීයද?',
            'expected_english': 'How much is the price from Matara to Galle?'
        },
        {
            'sinhala': 'කොළඹ සිට පානදුර දක්වා සහ කොළඹ සිට ගාල්ල දක්වා ගාස්තු සසඳා බලන්න.',
            'expected_english': 'Compare fares from Colombo to Panadura and from Colombo to Galle.'
        },
        {
            'sinhala': 'රුපියල් 500 ට අඩු ගාස්තු සහිත මාර්ග පෙන්වන්න.',
            'expected_english': 'Show routes with fares under 500 rupees.'
        },
        {
            'sinhala': 'අඩු මිලේ මාර්ග නිර්දේශ කරන්න.',
            'expected_english': 'Recommend cheap routes.'
        },
        {
            'sinhala': 'කොළඹ සිට යන මාර්ග මොනවාද?',
            'expected_english': 'What routes depart from Colombo?'
        },
        {
            'sinhala': 'සාමාන්‍ය ගාස්තුව කීයද?',
            'expected_english': 'What is the average fare?'
        },
        {
            'sinhala': 'කඩුවෙල සිට මාතර දක්වා සහ ගාල්ල දක්වා බස් ගාස්තු සසඳන්න.',
            'expected_english': 'Compare bus fares from Kaduwela to Matara and to Galle.'
        },
        {
            'sinhala': 'කොළඹ සිට ගාල්ල දක්වා ටිකට් මිල කීයද?',
            'expected_english': 'What is the ticket price from Colombo to Galle?'
        },
        {
            'sinhala': 'රුපියල් 1000 වැඩි ගාස්තු සහිත මාර්ග සදහන් කරන්න.',
            'expected_english': 'List routes with fares over 1000 rupees.'
        }
    ]

    results = []
    total_exact = 0
    total_good = 0
    total_tests = len(test_cases)

    for test_case in test_cases:
        sinhala = test_case['sinhala']
        expected = test_case['expected_english']
        is_sinhala = self.is_sinhala_text(sinhala)

        # Reset method tracker and translate
        self.last_translation_method = None
        translated = self.translate_text(sinhala, 'en', 'si') or ''

        tr = translated.strip()
        ex = expected.strip()
        tr_low = tr.lower()
        ex_low = ex.lower()

        # Accuracy heuristic. An empty translation must not count as a
        # match: '' is a substring of every string.
        if tr_low == ex_low:
            accuracy = 'exact'
            total_exact += 1
            total_good += 1
        elif tr_low and (tr_low in ex_low or ex_low in tr_low):
            accuracy = 'good'
            total_good += 1
        else:
            accuracy = 'partial'

        # Intent preservation check for comparisons. The stem 'සසඳ'
        # covers both 'සසඳන්න' and 'සසඳා බලන්න'.
        intent_preserved = True
        if 'සසඳ' in sinhala:
            intent_preserved = ('compare' in tr_low)

        results.append({
            'sinhala_query': sinhala,
            'is_sinhala_detected': is_sinhala,
            'translated_english': tr,
            'expected_english': ex,
            'translation_accuracy': accuracy,
            'intent_preserved': intent_preserved,
            'method_used': self.last_translation_method or ('llm' if self.openai_api_key else 'dictionary')
        })

    summary = {
        'total_tests': total_tests,
        'exact_matches': total_exact,
        'good_or_better': total_good,
        'accuracy_rate_percent': round((total_good / total_tests) * 100, 2) if total_tests else 0
    }

    self.logger.info(f"Translation test summary: {summary}")

    return {
        'translation_service_status': 'active',
        'available_methods': {
            'llm': self.openai_api_key is not None,
            'libre_translate': True,
            'mymemory': True,
            'dictionary': True
        },
        'summary': summary,
        'test_results': results
    }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Enhanced Translation Service for Multi-Language Support
4
+ Handles translation of queries and responses for Sinhala, Tamil, Singlish, and English
5
+ """
6
+
7
+ import requests
8
+ import json
9
+ import re
10
+ import openai
11
+ from typing import Dict, Any, Optional
12
+ from config import Config
13
+ from logger import get_logger
14
+ from language_detector import LanguageDetector
15
+
16
+ class TranslationService:
17
def __init__(self):
    """Set up configuration, logging, language detection and term tables.

    Reads ``OPENAI_API_KEY`` from ``Config`` (``None`` when absent) and two
    environment switches, ``USE_PATTERN_TRANSLATION`` and
    ``FORCE_LLM_TRANSLATION``, each enabled only by the value ``true``
    (case-insensitive). Builds the Tamil and Sinhala transport-term tables
    used for dictionary translation and place-name mapping.
    """
    self.config = Config()
    self.openai_api_key = getattr(self.config, 'OPENAI_API_KEY', None)
    self.logger = get_logger(self.__class__.__name__)
    self.language_detector = LanguageDetector()

    # Controls
    import os
    self.use_pattern_translation = os.getenv('USE_PATTERN_TRANSLATION', 'false').lower() == 'true'
    self.force_llm_translation = os.getenv('FORCE_LLM_TRANSLATION', 'false').lower() == 'true'
    # Name of the backend that produced the most recent translation.
    self.last_translation_method: Optional[str] = None

    # Free translation APIs
    self.libre_translate_url = "https://libretranslate.de/translate"  # Free public instance
    self.mymemory_url = "https://api.mymemory.translated.net/get"

    # Tamil transport terms and their English equivalents
    self.tamil_transport_terms = {
        # Fare related
        'எவ்வளவு': 'how much',
        'விலை': 'price',
        'கட்டணம்': 'fare',
        'செலவு': 'cost',
        'பேருந்து கட்டணம்': 'bus fare',
        'ரயில் கட்டணம்': 'train fare',
        'டிக்கெட் விலை': 'ticket price',

        # Locations
        'கொழும்பு': 'Colombo',
        'கண்டி': 'Kandy',
        'காலி': 'Galle',
        'மாத்தறை': 'Matara',
        'அனுராதபுரம்': 'Anuradhapura',
        'பனதுரை': 'Panadura',
        'அலுத்துகமா': 'Aluthgama',
        'நுகேகோடா': 'Nugegoda',
        'தெஹிவாலா': 'Dehiwala',
        'மொரட்டுவா': 'Moratuwa',

        # Direction words
        'இருந்து': 'from',
        'வரை': 'to',
        'வழியாக': 'via',
        'மூலம்': 'through',

        # Question words
        'எங்கே': 'where',
        'எப்போது': 'when',
        'எப்படி': 'how',
        'என்ன': 'what',
        'யார்': 'who',

        # Comparison words
        'உடன்': 'with',
        'மற்றும்': 'and',
        'அல்லது': 'or',
        'அதிகம்': 'more',
        'குறைவு': 'less',
        'ஒரே': 'same',
        'வேறு': 'different',
        'ஒப்பிடு': 'compare',
        'வித்தியாசம்': 'difference',

        # Time words
        'இப்போது': 'now',
        'இன்று': 'today',
        'நாளை': 'tomorrow',
        'நேற்று': 'yesterday',

        # Common verbs
        'போ': 'go',
        'வா': 'come',
        'பார்': 'see',
        'தெரிந்து கொள்': 'know',
        'கண்டுபிடி': 'find',
        'கற்றுக்கொள்': 'learn',
        'பரிந்துரை': 'recommend',
        'காட்டு': 'show',

        # Numbers and currency
        'ரூபாய்': 'rupees',
        'ரூ': 'rupees',

        # Common phrases ('உடன்' is already listed under comparison words)
        'இடையில்': 'between',
        'பாதைகள்': 'routes',
        'பிரபலமான': 'popular',
        'சராசரி': 'average',
        'தரவு': 'data',
        'புள்ளிவிவரங்கள்': 'statistics',
    }

    # Common transport terms in Sinhala and their English equivalents
    self.transport_terms = {
        # Fare related
        'කීයද': 'how much',
        'මිල': 'price',
        'වාරික': 'fare',
        'වාරිකය': 'fare',
        'වාරිකව': 'fare',
        'ගාස්තු': 'fare',
        'ගාස්තුව': 'fare',
        'ප්‍රවාහන ගාස්තු': 'transport fare',
        'බස් ගාස්තු': 'bus fare',
        'බස් ගාස්තුව': 'bus fare',
        'රේල් ගාස්තු': 'train fare',
        'රේල් ගාස්තුව': 'train fare',

        # Locations
        'කොළඹ': 'Colombo',
        'මහනුවර': 'Kandy',
        'මහනුවරට': 'Kandy',
        'ගාල්ල': 'Galle',
        'ගාල්ලට': 'Galle',
        'මාතර': 'Matara',
        'මාතරට': 'Matara',
        'අනුරාධපුර': 'Anuradhapura',
        'අනුරාධපුරට': 'Anuradhapura',
        'පානදුර': 'Panadura',
        'පානදුරට': 'Panadura',
        'අලුත්ගම': 'Aluthgama',
        'අලුත්ගමට': 'Aluthgama',
        'නුගේගොඩ': 'Nugegoda',
        'නුගේගොඩට': 'Nugegoda',
        'දෙහිවල': 'Dehiwala',
        'දෙහිවලට': 'Dehiwala',
        'මොරටුව': 'Moratuwa',
        'මොරටුවට': 'Moratuwa',

        # Direction words
        'වලින්': 'from',
        'වල': 'from',
        'ට': 'to',
        'වෙත': 'to',
        'සිට': 'from',
        'දක්වා': 'to',
        'සි': 'from',

        # Question words
        'කොහෙද': 'where',
        'කවදාද': 'when',
        'කොහොමද': 'how',
        'මොනවාද': 'what',
        'කවුද': 'who',

        # Comparison words
        'සමඟ': 'with',
        'සහ': 'and',
        'හෝ': 'or',
        'වඩා': 'more',
        'අඩු': 'less',
        'සමාන': 'same',
        'වෙනස': 'different',
        'සසඳන්න': 'compare',
        'සසඳන': 'compare',

        # Time words
        'දැන්': 'now',
        'අද': 'today',
        'හෙට': 'tomorrow',
        'ඊයේ': 'yesterday',

        # Common verbs
        'යන්න': 'go',
        'යන': 'go',
        'එන්න': 'come',
        'බලන්න': 'see',
        'දැනගන්න': 'know',
        'සොයන්න': 'find',
        'සොයන': 'find',
        'ඉගෙනගන්න': 'learn',
        'නිර්දේශ': 'recommend',
        'නිර්දේශ කරන්න': 'recommend',
        'පෙන්වන්න': 'show',
        'පෙන්වන': 'show',

        # Numbers and currency
        'රුපියල්': 'rupees',
        'රු': 'rupees',
        'රුපියල': 'rupees',

        # Common phrases
        'අතර': 'between',
        'සහිත': 'with',
        'මාර්ග': 'routes',
        'මාර්ගවල': 'routes',
        'ගමනාන්ත': 'destinations',
        'ප්‍රසිද්ධ': 'popular',
        'සාමාන්‍ය': 'average',
        'සාමාන්‍යය': 'average',
        'දත්ත': 'data',
        'සංඛ්‍යාලේඛන': 'statistics',
    }

    # Sinhala script detection pattern (Unicode block U+0D80-U+0DFF)
    self.sinhala_pattern = re.compile(r'[\u0D80-\u0DFF]')
214
+
215
def is_sinhala_text(self, text: str) -> bool:
    """Report whether *text* contains any character matched by ``self.sinhala_pattern``.

    The decision and the inspected text are written to the debug log.
    """
    match = self.sinhala_pattern.search(text)
    found = match is not None
    self.logger.debug(f"Sinhala detection: detected={found}, text='{text}'")
    return found
220
+
221
def is_tamil_text(self, text: str) -> bool:
    """Return True when *text* contains at least one Tamil-script character.

    Tamil script is recognised by the Unicode range U+0B80-U+0BFF. The
    outcome is written to the debug log together with the text.
    """
    found = re.search(r'[\u0B80-\u0BFF]', text) is not None
    self.logger.debug(f"Tamil detection: detected={found}, text='{text}'")
    return found
227
+
228
def is_singlish_text(self, text: str) -> bool:
    """Return True when the language detector labels *text* as ``'singlish'``.

    Delegates to ``self.language_detector.detect_language`` and inspects the
    ``'language'`` entry of its result.
    """
    return self.language_detector.detect_language(text)['language'] == 'singlish'
232
+
233
+ def _map_sinhala_place(self, text: str) -> str:
234
+ """Map a Sinhala place token to its English equivalent using known terms and suffix stripping."""
235
+ candidate = text.strip()
236
+ # Direct map
237
+ if candidate in self.transport_terms:
238
+ return self.transport_terms[candidate]
239
+ # Strip common Sinhala case particles/suffixes and try again
240
+ base = re.sub(r'(ට|වෙත|දක්වා|වලින්|වල|සිට)$', '', candidate)
241
+ if base in self.transport_terms:
242
+ return self.transport_terms[base]
243
+ return candidate
244
+
245
+ def _map_tamil_place(self, text: str) -> str:
246
+ """Map a Tamil place token to its English equivalent using known terms and suffix stripping."""
247
+ candidate = text.strip()
248
+ # Direct map
249
+ if candidate in self.tamil_transport_terms:
250
+ return self.tamil_transport_terms[candidate]
251
+ # Strip common Tamil case particles/suffixes and try again
252
+ base = re.sub(r'(இருந்து|வரை|வழியாக|மூலம்)$', '', candidate)
253
+ if base in self.tamil_transport_terms:
254
+ return self.tamil_transport_terms[base]
255
+ return candidate
256
+
257
def _parse_sinhala_fare_query(self, query: str) -> Optional[str]:
    """Detect a simple Sinhala fare query and build a clean English query.

    Example handled:
        "කොළඹ සිට මහනුවරට ගාස්තුව කීයද?" -> "What is the fare from Colombo to Kandy?"

    Args:
        query: Raw user query.

    Returns:
        The English query "What is the fare from <src> to <dst>?", or None
        when the text has no fare keyword, does not match the
        "<src> සිට <dst>(ට|වෙත|දක්වා)" shape, or parsing fails.
    """
    try:
        # Quick keyword gate to avoid false positives. 'ගාස්තු' and 'වාරික'
        # also cover the longer forms 'ගාස්තුව' and 'වාරිකය'.
        if not any(tok in query for tok in ('ගාස්තු', 'වාරික', 'මිල')):
            return None
        # Source and destination around the "from" / "to" particles.
        # - \u200d (zero-width joiner) is allowed inside names because it
        #   occurs in Sinhala conjuncts.
        # - The negative lookahead requires the "to" particle to end a word;
        #   without it the lazy group stops at the first 'ට' inside a name
        #   such as "කටුනායක".
        m = re.search(
            r'([\u0D80-\u0DFF\u200d\s]+?)\s*සිට\s*'
            r'([\u0D80-\u0DFF\u200d\s]+?)\s*(?:ට|වෙත|දක්වා)(?![\u0D80-\u0DFF\u200d])',
            query,
        )
        if not m:
            return None
        src_en = self._map_sinhala_place(m.group(1).strip())
        dst_en = self._map_sinhala_place(m.group(2).strip())
        return f"What is the fare from {src_en} to {dst_en}?"
    except Exception as e:
        # Best-effort parser: never let a parsing problem break translation,
        # but leave a trace instead of swallowing the error silently.
        self.logger.debug(f"Sinhala fare pattern parse failed: {e}")
        return None
276
+
277
def _parse_tamil_fare_query(self, query: str) -> Optional[str]:
    """Detect a simple Tamil fare query and build a clean English query.

    Example handled:
        "கொழும்பு இருந்து கண்டி வரை கட்டணம் எவ்வளவு?" -> "What is the fare from Colombo to Kandy?"

    Args:
        query: Raw user query.

    Returns:
        The English query "What is the fare from <src> to <dst>?", or None
        when the text has no fare keyword, does not match the
        "<src> இருந்து <dst> (வரை|வழியாக)" shape, or parsing fails.
    """
    try:
        # Quick keyword gate to avoid false positives. The bare question
        # word 'எவ்வளவு' ("how much") is deliberately not a trigger: it also
        # appears in distance/time questions, which must not be rewritten
        # into fare queries. This mirrors the Sinhala parser, which does not
        # trigger on 'කීයද' either.
        if not any(tok in query for tok in ('கட்டணம்', 'விலை', 'செலவு')):
            return None
        # Source and destination around the "from" / "to" particles. The
        # negative lookahead requires the closing particle to end a word.
        m = re.search(
            r'([\u0B80-\u0BFF\s]+?)\s*இருந்து\s*'
            r'([\u0B80-\u0BFF\s]+?)\s*(?:வரை|வழியாக)(?![\u0B80-\u0BFF])',
            query,
        )
        if not m:
            return None
        src_en = self._map_tamil_place(m.group(1).strip())
        dst_en = self._map_tamil_place(m.group(2).strip())
        return f"What is the fare from {src_en} to {dst_en}?"
    except Exception as e:
        # Best-effort parser: never let a parsing problem break translation,
        # but leave a trace instead of swallowing the error silently.
        self.logger.debug(f"Tamil fare pattern parse failed: {e}")
        return None
296
+
297
def translate_with_llm(self, text: str, target_lang: str, source_lang: str = 'auto') -> Optional[str]:
    """Translate ``text`` with the OpenAI chat API, preserving query intent.

    Supported directions are Sinhala/Tamil/Singlish -> English and
    English -> Sinhala/Tamil. The request is few-shot conditioned with
    examples for the source language, and the instructions and turn labels
    name the actual source and target languages.

    Args:
        text: Text to translate.
        target_lang: Target code ('en', 'si' or 'ta').
        source_lang: Source code ('si', 'ta', 'en', 'singlish'), or 'auto'
            to resolve it through ``self.language_detector``.

    Returns:
        The translated text, or None when no API key is configured, the
        direction is unsupported, the model returns nothing, or the call
        fails (failures are logged as warnings).
    """
    if not self.openai_api_key:
        return None

    try:
        # Determine source language
        if source_lang == 'auto':
            detected_lang = self.language_detector.detect_language(text)['language']
            source_lang = {
                'sinhala': 'si',
                'tamil': 'ta',
                'singlish': 'singlish',
            }.get(detected_lang, 'en')

        language_names = {
            'si': 'Sinhala',
            'ta': 'Tamil',
            'en': 'English',
            'singlish': 'Singlish',
        }
        supported_directions = {
            ('si', 'en'),
            ('en', 'si'),
            ('ta', 'en'),
            ('en', 'ta'),
            ('singlish', 'en'),
        }
        if (source_lang, target_lang) not in supported_directions:
            return None
        source_name = language_names[source_lang]
        target_name = language_names[target_lang]

        # Few-shot examples that pin comparison/imperative structure.
        # They only exist for translations into English.
        examples = []
        if source_lang == 'si':
            examples = [
                (
                    "කොළඹ සිට මහනුවරට යන බස් ගාස්තුව කීයද?",
                    "What is the bus fare from Colombo to Kandy?"
                ),
                (
                    "කොළඹ සිට ගාල්ල දක්වා ටිකට් මිල කීයද?",
                    "What is the ticket price from Colombo to Galle?"
                ),
                (
                    "කොළඹ සිට පානදුර දක්වා සහ කොළඹ සිට ගාල්ල දක්වා ගාස්තු සසඳා බලන්න.",
                    "Compare fares from Colombo to Panadura and from Colombo to Galle."
                ),
                (
                    "රුපියල් 500 ට අඩු ගාස්තු සහිත මාර්ග පෙන්වන්න.",
                    "Show routes with fares under 500 rupees."
                ),
                (
                    "අඩු මිලේ මාර්ග නිර්දේශ කරන්න.",
                    "Recommend cheap routes."
                ),
            ]
        elif source_lang == 'ta':
            examples = [
                (
                    "கொழும்பு இருந்து கண்டி வரை பேருந்து கட்டணம் எவ்வளவு?",
                    "What is the bus fare from Colombo to Kandy?"
                ),
                (
                    "கொழும்பு இருந்து காலி வரை டிக்கெட் விலை எவ்வளவு?",
                    "What is the ticket price from Colombo to Galle?"
                ),
                (
                    "கொழும்பு இருந்து பனதுரை வரை மற்றும் கொழும்பு இருந்து காலி வரை கட்டணம் ஒப்பிடு.",
                    "Compare fares from Colombo to Panadura and from Colombo to Galle."
                ),
                (
                    "ரூபாய் 500 க்கு குறைவான கட்டணம் உள்ள பாதைகளை காட்டு.",
                    "Show routes with fares under 500 rupees."
                ),
                (
                    "குறைந்த விலையில் பாதைகளை பரிந்துரை.",
                    "Recommend cheap routes."
                ),
            ]
        elif source_lang == 'singlish':
            examples = [
                (
                    "කොළඹ සිට Kandy ගාස්තුව කීයද?",
                    "What is the fare from Colombo to Kandy?"
                ),
                (
                    "Colombo සිට ගාල්ලට bus fare කීයද?",
                    "What is the bus fare from Colombo to Galle?"
                ),
                (
                    "කොළඹ සිට Panadura සහ Colombo සිට Galle fares compare කරන්න.",
                    "Compare fares from Colombo to Panadura and from Colombo to Galle."
                ),
            ]

        # Compose messages with few-shot conditioning
        def build_messages(txt: str):
            system_content = (
                "You are a professional translator. Translate accurately and naturally. "
                "Preserve imperative/comparative intent and list structure. Do not paraphrase. "
                f"Return only the {target_name} translation without quotes. "
            )
            msgs = []
            if target_lang == 'en':
                # The canonical-phrasing and place-name rules describe
                # English output, so they are only sent for ->English.
                system_content += (
                    "Canonical phrasing rules (use exactly): \n"
                    "- Use 'Compare' for comparison requests.\n"
                    "- Use 'Show' for requests like 'පෙන්වන්න' (do not use Provide/List).\n"
                    "- Use 'How much is the' for 'කීයද' fare/price questions.\n"
                    "- Use 'cheap' (not 'affordable').\n"
                    "- Use 'under' (not 'below') for '< value'.\n"
                )
                msgs.append({"role": "system", "content": system_content})
                msgs.append({
                    "role": "user",
                    "content": (
                        "Instructions: Preserve structure. Use 'Compare' for 'සසඳ', use 'from' for 'සිට' and 'to' for 'ට/වෙත/දක්වා'.\n"
                        "Use exact place names: මහනුවර=Kandy, කොළඹ=Colombo, ගාල්ල=Galle, මාතර=Matara, අනුරාධපුර=Anuradhapura."
                    ),
                })
            else:
                msgs.append({"role": "system", "content": system_content})
            # Label every turn with the real language names so the model is
            # not told that Tamil or English input is "Sinhala".
            for src_text, tgt_text in examples:
                msgs.append({"role": "user", "content": f"{source_name}: {src_text}\n{target_name}:"})
                msgs.append({"role": "assistant", "content": tgt_text})
            msgs.append({"role": "user", "content": f"{source_name}: {txt}\n{target_name}:"})
            return msgs

        # Use new OpenAI SDK
        try:
            from openai import OpenAI
            client = OpenAI(api_key=self.openai_api_key)
            response = client.chat.completions.create(
                model="gpt-3.5-turbo",
                max_tokens=150,
                temperature=0.3,
                messages=build_messages(text)
            )
            translated = (response.choices[0].message.content or '').strip()
        except Exception as sdk_err:
            # Fallback to legacy API if available; if that fails too,
            # surface the error from the current SDK.
            try:
                import openai
                openai.api_key = self.openai_api_key
                response = openai.ChatCompletion.create(
                    model="gpt-3.5-turbo",
                    max_tokens=150,
                    temperature=0.3,
                    messages=build_messages(text)
                )
                translated = (response.choices[0].message.content or '').strip()
            except Exception:
                raise sdk_err

        # Models sometimes wrap the answer in quotes despite instructions.
        if translated.startswith('"') and translated.endswith('"'):
            translated = translated[1:-1]
        if not translated:
            return None
        # Only claim the translation once there is a usable result.
        self.last_translation_method = 'llm'
        return translated
    except Exception as e:
        self.logger.warning(f"LLM translation error: {e}")
        return None
477
+
478
def translate_with_libre_translate(self, text: str, target_lang: str, source_lang: str = 'auto') -> Optional[str]:
    """Translate ``text`` through the LibreTranslate HTTP API.

    Args:
        text: Text to translate.
        target_lang: Target code; 'si', 'ta' and 'en' are accepted.
        source_lang: Source code; anything other than 'si', 'ta' or 'en'
            is sent as 'auto'.

    Returns:
        The translated text, or None when the target language is not
        supported, the service does not answer with HTTP 200, the answer
        contains no translation, or the request fails (logged as a warning).
    """
    try:
        # Map language codes
        lang_map = {
            'si': 'si',  # Sinhala
            'ta': 'ta',  # Tamil
            'en': 'en',  # English
            'auto': 'auto'
        }

        source = lang_map.get(source_lang, 'auto')
        target = lang_map.get(target_lang)
        if target is None or target == 'auto':
            # Falling back to English here would return text in the wrong
            # language while reporting success; let the caller try the
            # next backend instead.
            return None

        payload = {
            'q': text,
            'source': source,
            'target': target,
            'format': 'text'
        }

        headers = {
            'Content-Type': 'application/json'
        }

        response = requests.post(
            self.libre_translate_url,
            json=payload,
            headers=headers,
            timeout=10
        )

        if response.status_code != 200:
            return None

        translated = response.json().get('translatedText')
        if not translated:
            return None

        self.logger.debug(f"LibreTranslate success: '{text}' -> '{translated}'")
        # Record the backend only when it actually produced a translation.
        self.last_translation_method = 'libretranslate'
        return translated

    except Exception as e:
        self.logger.warning(f"LibreTranslate error: {e}")
        return None
521
+
522
def translate_with_mymemory(self, text: str, target_lang: str, source_lang: str = 'auto') -> Optional[str]:
    """Translate ``text`` through the MyMemory HTTP API.

    Args:
        text: Text to translate.
        target_lang: Target language code, sent as the second half of
            ``langpair``.
        source_lang: 'si', 'ta' or 'en'. Any other value (including 'auto')
            is resolved through ``self.language_detector``.

    Returns:
        The translated text, or None when the source language cannot be
        resolved, source and target are identical, the service reports a
        failure, or the request fails (logged as a warning).
    """
    try:
        # ``langpair`` is built from concrete codes, so 'auto' and other
        # unknown values are resolved by detection first. Singlish is sent
        # as Sinhala, matching the default used by the other methods.
        if source_lang in ('si', 'ta', 'en'):
            source = source_lang
        else:
            detected = self.language_detector.detect_language(text)['language']
            source = {
                'sinhala': 'si',
                'tamil': 'ta',
                'english': 'en',
                'singlish': 'si',
            }.get(detected)

        if source is None or source == target_lang:
            return None

        params = {
            'q': text,
            'langpair': f"{source}|{target_lang}"
        }

        response = requests.get(
            self.mymemory_url,
            params=params,
            timeout=10
        )

        if response.status_code != 200:
            return None

        result = response.json()
        # NOTE(review): MyMemory is expected to report request-level
        # failures in the JSON body ('responseStatus') with the message in
        # 'translatedText' — confirm against the API docs. The body status
        # is checked so such a message is not returned as a translation.
        if str(result.get('responseStatus', 200)) != '200':
            return None

        translated = (result.get('responseData') or {}).get('translatedText')
        if not translated:
            return None

        self.logger.debug(f"MyMemory success: '{text}' -> '{translated}'")
        # Record the backend only when it actually produced a translation.
        self.last_translation_method = 'mymemory'
        return translated

    except Exception as e:
        self.logger.warning(f"MyMemory translation error: {e}")
        return None
558
+
559
+
560
+
561
def translate_with_dictionary(self, text: str, target_lang: str, source_lang: str = 'auto') -> str:
    """Translate by substituting known transport terms (offline fallback).

    Uses ``self.transport_terms`` (Sinhala -> English) and
    ``self.tamil_transport_terms`` (Tamil -> English) in either direction.
    Text not covered by the dictionaries is left untouched.

    Args:
        text: Text to translate.
        target_lang: 'en', 'si' or 'ta'; any other value returns ``text``.
        source_lang: Only used when ``target_lang`` is 'en'. 'auto' is
            resolved through ``self.language_detector`` and defaults to
            Sinhala when the detector reports anything other than Tamil.

    Returns:
        ``text`` with every known term replaced.
    """
    if target_lang == 'en':
        # Determine source language if auto
        if source_lang == 'auto':
            detected_lang = self.language_detector.detect_language(text)['language']
            source_lang = 'ta' if detected_lang == 'tamil' else 'si'  # Default to Sinhala

        if source_lang == 'si':
            terms = self.transport_terms
        elif source_lang == 'ta':
            terms = self.tamil_transport_terms
        else:
            return text

        translated = text
        # Longest term first: otherwise a short term that is a prefix of a
        # longer one (e.g. 'ගාස්තු' inside 'ගාස්තුව') is replaced first and
        # leaves a stray suffix behind.
        for local, english in sorted(terms.items(), key=lambda kv: len(kv[0]), reverse=True):
            if local:
                translated = translated.replace(local, english)
        return translated

    if target_lang == 'si':
        terms = self.transport_terms
    elif target_lang == 'ta':
        terms = self.tamil_transport_terms
    else:
        return text

    translated = text
    # English -> local: longest term first, and whole words only, so short
    # terms such as 'to' do not rewrite fragments of other words. The sort
    # is stable, so when several local terms share one English word the
    # first one defined still wins.
    for local, english in sorted(terms.items(), key=lambda kv: len(kv[1]), reverse=True):
        if not english:
            continue
        pattern = r'(?<!\w)' + re.escape(english) + r'(?!\w)'
        # A callable replacement keeps the local term literal (no
        # backslash/group expansion).
        translated = re.sub(pattern, lambda _m, repl=local: repl, translated)
    return translated
601
+
602
def translate_text(self, text: str, target_lang: str, source_lang: str = 'auto') -> str:
    """Translate ``text`` using the first backend that yields a result.

    Backends are tried in order: LLM, MyMemory, LibreTranslate, dictionary.
    When ``self.force_llm_translation`` is set only the LLM is tried, and
    the dictionary serves as the final fallback. The backend that produced
    the result is recorded in ``self.last_translation_method``.

    Args:
        text: Text to translate; empty or whitespace-only text is returned
            unchanged.
        target_lang: Target language code.
        source_lang: Source language code, or 'auto'.

    Returns:
        The translated text, stripped of surrounding whitespace.
    """
    if not text or not text.strip():
        return text

    translation_methods = [
        ('LLM', lambda: self.translate_with_llm(text, target_lang, source_lang)),
    ]
    if not self.force_llm_translation:
        translation_methods += [
            ('MyMemory', lambda: self.translate_with_mymemory(text, target_lang, source_lang)),
            ('LibreTranslate', lambda: self.translate_with_libre_translate(text, target_lang, source_lang)),
            # Pass the caller's source language instead of re-detecting it.
            ('Dictionary', lambda: self.translate_with_dictionary(text, target_lang, source_lang)),
        ]

    for method_name, method_func in translation_methods:
        # Clear the tracker before every attempt so a value left by an
        # earlier call, or by a backend that came back empty, is never
        # reported for this translation.
        self.last_translation_method = None
        try:
            result = method_func()
        except Exception as e:
            self.logger.warning(f"{method_name} translation failed: {e}")
            continue
        if result and result.strip():
            self.logger.info(f"Translation successful using {method_name}")
            if not self.last_translation_method:
                self.last_translation_method = method_name.lower()
            return result.strip()

    # Final fallback
    result = self.translate_with_dictionary(text, target_lang, source_lang)
    self.last_translation_method = 'dictionary'
    return result
636
+
637
def translate_query(self, query: str) -> Dict[str, Any]:
    """Translate a user query from any supported language to English.

    English input is returned unchanged. For Sinhala/Tamil input the
    pattern-based fare parser is tried first (when
    ``self.use_pattern_translation`` is set); otherwise the query goes
    through ``translate_text`` and ``_normalize_english_query``.

    Args:
        query: Raw user query.

    Returns:
        A dict with the keys ``is_sinhala``, ``is_tamil``, ``is_singlish``,
        ``detected_language``, ``original_query``, ``translated_query``,
        ``translation_method`` and ``detection_confidence``.
    """
    # Detect the language of the input
    detection_result = self.language_detector.detect_language(query)
    detected_language = detection_result['language']

    def build_result(translated_query: str, method: str) -> Dict[str, Any]:
        return {
            'is_sinhala': detected_language == 'sinhala',
            'is_tamil': detected_language == 'tamil',
            'is_singlish': detected_language == 'singlish',
            'detected_language': detected_language,
            'original_query': query,
            'translated_query': translated_query,
            'translation_method': method,
            'detection_confidence': detection_result['confidence']
        }

    # If it's already English, return as is
    if detected_language == 'english':
        return build_result(query, 'none')

    # Handle pattern-based parsing for specific languages
    if self.use_pattern_translation:
        parsed = None
        if detected_language == 'sinhala':
            parsed = self._parse_sinhala_fare_query(query)
        elif detected_language == 'tamil':
            parsed = self._parse_tamil_fare_query(query)

        if parsed:
            self.logger.info(f"Pattern-based {detected_language} fare parse: '{query}' -> '{parsed}'")
            return build_result(parsed, 'pattern')

    # Source language code for translation; anything that is not Tamil
    # (including Singlish) is treated as Sinhala.
    source_lang = 'ta' if detected_language == 'tamil' else 'si'

    # Forget the backend recorded by any earlier call so the method
    # reported below belongs to this query.
    self.last_translation_method = None
    # Translate to English
    translated = self.translate_text(query, 'en', source_lang)
    # Normalize English synonyms to expected NLP vocabulary
    translated = self._normalize_english_query(translated)
    method = self.last_translation_method or ('llm' if self.openai_api_key else 'dictionary')

    self.logger.info(f"Translated {detected_language} query ({method}): '{query}' -> '{translated}'")

    return build_result(translated, method)
698
+
699
def _normalize_english_query(self, text: str) -> str:
    """Normalize English synonyms to the vocabulary the NLP patterns expect.

    The text is lowercased and fee/charge/ticket wording is rewritten to
    fare/cost. Only whole words are replaced, so unrelated words that merely
    contain a synonym (e.g. "coffee") are left intact.

    Args:
        text: English query text.

    Returns:
        The lowercased, normalized text; falsy input is returned unchanged.
    """
    if not text:
        return text
    # Order matters: plural forms come before singular ones, and
    # 'ticket price'/'ticket fare' before 'bus ticket'.
    replacements = (
        ('fees', 'fare'),
        ('fee', 'fare'),
        ('charges', 'cost'),
        ('charge', 'cost'),
        ('ticket price', 'fare'),
        ('ticket fare', 'fare'),
        ('bus ticket', 'bus fare'),
    )
    # The result stays lowercase; downstream lowercases anyway.
    normalized = text.lower()
    for old, new in replacements:
        normalized = re.sub(r'\b' + re.escape(old) + r'\b', new, normalized)
    return normalized
718
+
719
def translate_response(self, response: Dict[str, Any], target_language: str = None) -> Dict[str, Any]:
    """Translate an English response into the user's language.

    Translates ``message``, every entry of ``suggestions`` and the
    ``original``/``corrected`` fields of every entry of ``corrections``.
    The input dict is not modified; a shallow copy is returned.

    Args:
        response: Response dict produced in English.
        target_language: Target code. When None it is derived from
            ``response['translation_info']['detected_language']``
            ('tamil' -> 'ta', anything else -> 'si').

    Returns:
        The translated copy, with ``translation_info`` replaced by
        metadata about this translation.
    """
    if target_language is None:
        detected = None
        if 'translation_info' in response:
            detected = response['translation_info'].get('detected_language')
        # Sinhala is the default for every language except Tamil
        target_language = 'ta' if detected == 'tamil' else 'si'

    def localize(english_text):
        return self.translate_text(english_text, target_language, 'en')

    localized = response.copy()

    # Main message
    if 'message' in response:
        localized['message'] = localize(response['message'])

    # Suggestions, if any
    suggestions = response.get('suggestions')
    if suggestions:
        localized['suggestions'] = [localize(item) for item in suggestions]

    # Corrections, if any
    corrections = response.get('corrections')
    if corrections:
        localized_corrections = []
        for entry in corrections:
            entry_copy = entry.copy()
            for field in ('original', 'corrected'):
                if field in entry:
                    entry_copy[field] = localize(entry[field])
            localized_corrections.append(entry_copy)
        localized['corrections'] = localized_corrections

    # Translation metadata
    method = 'llm' if self.openai_api_key else 'dictionary'
    localized['translation_info'] = {
        'translated': True,
        'target_language': target_language,
        'translation_method': method
    }
    return localized
776
+
777
def get_sinhala_examples(self) -> Dict[str, Any]:
    """Return sample Sinhala queries grouped by query category.

    Returns:
        A dict mapping each category name to a list of
        ``{'query': ..., 'description': ...}`` entries.
    """
    def example(query, description):
        return {'query': query, 'description': description}

    return {
        'fare_queries': [
            example(
                'කොළඹ සිට මහනුවරට යන බස් ගාස්තුව කීයද?',
                'කොළඹ සිට මහනුවරට යන බස් ගාස්තුව සොයන්න',
            ),
            example(
                'මාතර සිට ගාල්ලට යන මිල කීයද?',
                'මාතර සිට ගාල්ලට යන මිල සොයන්න',
            ),
            example(
                'අනුරාධපුර සිට කොළඹට යන වාරිකය',
                'අනුරාධපුර සිට කොළඹට යන වාරිකය සොයන්න',
            ),
        ],
        'comparison_queries': [
            example(
                'කොළඹ සිට මහනුවරට සහ කොළඹ සිට ගාල්ලට යන ගාස්තු සසඳන්න',
                'විවිධ මාර්ගවල ගාස්තු සසඳන්න',
            ),
            example(
                'කොළඹ සිට මහනුවරට සහ කොළඹ සිට අනුරාධපුරට යන ගාස්තුවල වෙනස කීයද?',
                'මාර්ග දෙකක ගාස්තු වෙනස සොයන්න',
            ),
        ],
        'range_queries': [
            example(
                'රුපියල් 500 ට අඩු ගාස්තු සහිත මාර්ග සොයන්න',
                'රුපියල් 500 ට අඩු ගාස්තු සහිත මාර්ග සොයන්න',
            ),
            example(
                'රුපියල් 200 සහ 800 අතර ගාස්තු සහිත මාර්ග පෙන්වන්න',
                'රුපියල් 200 සහ 800 අතර ගාස්තු සහිත මාර්ග සොයන්න',
            ),
        ],
        'recommendation_queries': [
            example(
                'අඩු මිලේ මාර්ග නිර්දේශ කරන්න',
                'අඩු මිලේ මාර්ග නිර්දේශ කරන්න',
            ),
            example(
                'ප්‍රසිද්ධ ගමනාන්ත පෙන්වන්න',
                'ප්‍රසිද්ධ ගමනාන්ත සොයන්න',
            ),
        ],
        'statistical_queries': [
            example(
                'සාමාන්‍ය ගාස්තුව කීයද?',
                'සාමාන්‍ය ගාස්තුව සොයන්න',
            ),
            example(
                'දත්ත ගබඩා සංඛ්‍යාලේඛන',
                'දත්ත ගබඩා සංඛ්‍යාලේඛන සොයන්න',
            ),
        ],
    }
837
+
838
def get_tamil_examples(self) -> Dict[str, Any]:
    """Return sample Tamil queries grouped by query category.

    Returns:
        A dict mapping each category name to a list of
        ``{'query': ..., 'description': ...}`` entries.
    """
    def example(query, description):
        return {'query': query, 'description': description}

    return {
        'fare_queries': [
            example(
                'கொழும்பு இருந்து கண்டி வரை பேருந்து கட்டணம் எவ்வளவு?',
                'கொழும்பு இருந்து கண்டி வரை பேருந்து கட்டணம் கண்டுபிடி',
            ),
            example(
                'மாத்தறை இருந்து காலி வரை விலை எவ்வளவு?',
                'மாத்தறை இருந்து காலி வரை விலை கண்டுபிடி',
            ),
            example(
                'அனுராதபுரம் இருந்து கொழும்பு வரை கட்டணம்',
                'அனுராதபுரம் இருந்து கொழும்பு வரை கட்டணம் கண்டுபிடி',
            ),
        ],
        'comparison_queries': [
            example(
                'கொழும்பு இருந்து கண்டி வரை மற்றும் கொழும்பு இருந்து காலி வரை கட்டணம் ஒப்பிடு',
                'வெவ்வேறு பாதைகளின் கட்டணம் ஒப்பிடு',
            ),
            example(
                'கொழும்பு இருந்து கண்டி வரை மற்றும் கொழும்பு இருந்து அனுராதபுரம் வரை கட்டணத்தின் வித்தியாசம் எவ்வளவு?',
                'இரண்டு பாதைகளின் கட்டண வித்தியாசம் கண்டுபிடி',
            ),
        ],
        'range_queries': [
            example(
                'ரூபாய் 500 க்கு குறைவான கட்டணம் உள்ள பாதைகளை கண்டுபிடி',
                'ரூபாய் 500 க்கு குறைவான கட்டணம் உள்ள பாதைகளை கண்டுபிடி',
            ),
            example(
                'ரூபாய் 200 மற்றும் 800 இடையில் கட்டணம் உள்ள பாதைகளை காட்டு',
                'ரூபாய் 200 மற்றும் 800 இடையில் கட்டணம் உள்ள பாதைகளை கண்டுபிடி',
            ),
        ],
        'recommendation_queries': [
            example(
                'குறைந்த விலையில் பாதைகளை பரிந்துரை',
                'குறைந்த விலையில் பாதைகளை பரிந்துரை',
            ),
            example(
                'பிரபலமான இலக்குகளை காட்டு',
                'பிரபலமான இலக்குகளை கண்டுபிடி',
            ),
        ],
        'statistical_queries': [
            example(
                'சராசரி கட்டணம் எவ்வளவு?',
                'சராசரி கட்டணம் கண்டுபிடி',
            ),
            example(
                'தரவு சேமிப்பக புள்ளிவிவரங்கள்',
                'தரவு சேமிப்பக புள்ளிவிவரங்கள் கண்டுபிடி',
            ),
        ],
    }
898
+
899
+ def test_translation(self) -> Dict[str, Any]:
900
+ """Test translation functionality on transportation-related queries in multiple languages."""
901
+ test_cases = [
902
+ # Sinhala test cases
903
+ {
904
+ 'language': 'sinhala',
905
+ 'original': 'කොළඹ සිට මහනුවරට යන බස් ගාස්තුව කීයද?',
906
+ 'expected_english': 'What is the bus fare from Colombo to Kandy?'
907
+ },
908
+ {
909
+ 'language': 'sinhala',
910
+ 'original': 'මාතර සිට ගාල්ලට යන මිල කීයද?',
911
+ 'expected_english': 'How much is the price from Matara to Galle?'
912
+ },
913
+ {
914
+ 'language': 'sinhala',
915
+ 'original': 'කොළඹ සිට පානදුර දක්වා සහ කොළඹ සිට ගාල්ල දක්වා ගාස්තු සසඳා බලන්න.',
916
+ 'expected_english': 'Compare fares from Colombo to Panadura and from Colombo to Galle.'
917
+ },
918
+ {
919
+ 'language': 'sinhala',
920
+ 'original': 'රුපියල් 500 ට අඩු ගාස්තු සහිත මාර්ග පෙන්වන්න.',
921
+ 'expected_english': 'Show routes with fares under 500 rupees.'
922
+ },
923
+ {
924
+ 'language': 'sinhala',
925
+ 'original': 'අඩු මිලේ මාර්ග නිර්දේශ කරන්න.',
926
+ 'expected_english': 'Recommend cheap routes.'
927
+ },
928
+
929
+ # Tamil test cases
930
+ {
931
+ 'language': 'tamil',
932
+ 'original': 'கொழும்பு இருந்து கண்டி வரை பேருந்து கட்டணம் எவ்வளவு?',
933
+ 'expected_english': 'What is the bus fare from Colombo to Kandy?'
934
+ },
935
+ {
936
+ 'language': 'tamil',
937
+ 'original': 'மாத்தறை இருந்து காலி வரை விலை எவ்வளவு?',
938
+ 'expected_english': 'How much is the price from Matara to Galle?'
939
+ },
940
+ {
941
+ 'language': 'tamil',
942
+ 'original': 'கொழும்பு இருந்து பனதுரை வரை மற்றும் கொழும்பு இருந்து காலி வரை கட்டணம் ஒப்பிடு.',
943
+ 'expected_english': 'Compare fares from Colombo to Panadura and from Colombo to Galle.'
944
+ },
945
+ {
946
+ 'language': 'tamil',
947
+ 'original': 'ரூபாய் 500 க்கு குறைவான கட்டணம் உள்ள பாதைகளை காட்டு.',
948
+ 'expected_english': 'Show routes with fares under 500 rupees.'
949
+ },
950
+ {
951
+ 'language': 'tamil',
952
+ 'original': 'குறைந்த விலையில் பாதைகளை பரிந்துரை.',
953
+ 'expected_english': 'Recommend cheap routes.'
954
+ },
955
+
956
+ # Singlish test cases
957
+ {
958
+ 'language': 'singlish',
959
+ 'original': 'කොළඹ සිට Kandy ගාස්තුව කීයද?',
960
+ 'expected_english': 'What is the fare from Colombo to Kandy?'
961
+ },
962
+ {
963
+ 'language': 'singlish',
964
+ 'original': 'Colombo සිට ගාල්ලට bus fare කීයද?',
965
+ 'expected_english': 'What is the bus fare from Colombo to Galle?'
966
+ },
967
+ {
968
+ 'language': 'singlish',
969
+ 'original': 'කොළඹ සිට Panadura සහ Colombo සිට Galle fares compare කරන්න.',
970
+ 'expected_english': 'Compare fares from Colombo to Panadura and from Colombo to Galle.'
971
+ },
972
+
973
+ # English test cases
974
+ {
975
+ 'language': 'english',
976
+ 'original': 'What is the fare from Colombo to Kandy?',
977
+ 'expected_english': 'What is the fare from Colombo to Kandy?'
978
+ },
979
+ {
980
+ 'language': 'english',
981
+ 'original': 'Show me routes from Panadura',
982
+ 'expected_english': 'Show me routes from Panadura'
983
+ }
984
+ ]
985
+
986
+ results = []
987
+ total_exact = 0
988
+ total_good = 0
989
+ total_tests = len(test_cases)
990
+
991
+ for test_case in test_cases:
992
+ original = test_case['original']
993
+ expected = test_case['expected_english']
994
+ language = test_case['language']
995
+
996
+ # Detect language
997
+ detection_result = self.language_detector.detect_language(original)
998
+ detected_language = detection_result['language']
999
+
1000
+ # Reset method tracker and translate
1001
+ self.last_translation_method = None
1002
+ translated = self.translate_text(original, 'en', 'auto') or ''
1003
+
1004
+ tr = translated.strip()
1005
+ ex = expected.strip()
1006
+ tr_low = tr.lower()
1007
+ ex_low = ex.lower()
1008
+
1009
+ # Accuracy heuristic
1010
+ if tr_low == ex_low:
1011
+ accuracy = 'exact'
1012
+ total_exact += 1
1013
+ total_good += 1
1014
+ elif tr_low in ex_low or ex_low in tr_low:
1015
+ accuracy = 'good'
1016
+ total_good += 1
1017
+ else:
1018
+ accuracy = 'partial'
1019
+
1020
+ # Intent preservation check for comparisons
1021
+ intent_preserved = True
1022
+ if language in ['sinhala', 'tamil'] and ('සසඳ' in original or 'ஒப்பிடு' in original):
1023
+ intent_preserved = ('compare' in tr_low)
1024
+
1025
+ results.append({
1026
+ 'original_query': original,
1027
+ 'language': language,
1028
+ 'detected_language': detected_language,
1029
+ 'translated_english': tr,
1030
+ 'expected_english': ex,
1031
+ 'translation_accuracy': accuracy,
1032
+ 'intent_preserved': intent_preserved,
1033
+ 'method_used': self.last_translation_method or ('llm' if self.openai_api_key else 'dictionary'),
1034
+ 'detection_confidence': detection_result['confidence']
1035
+ })
1036
+
1037
+ summary = {
1038
+ 'total_tests': total_tests,
1039
+ 'exact_matches': total_exact,
1040
+ 'good_or_better': total_good,
1041
+ 'accuracy_rate_percent': round((total_good / total_tests) * 100, 2) if total_tests else 0
1042
+ }
1043
+
1044
+ self.logger.info(f"Translation test summary: {summary}")
1045
+
1046
+ return {
1047
+ 'translation_service_status': 'active',
1048
+ 'supported_languages': ['sinhala', 'tamil', 'singlish', 'english'],
1049
+ 'available_methods': {
1050
+ 'llm': self.openai_api_key is not None,
1051
+ 'libre_translate': True,
1052
+ 'mymemory': True,
1053
+ 'dictionary': True
1054
+ },
1055
+ 'summary': summary,
1056
+ 'test_results': results
1057
+ }