File size: 4,416 Bytes
7c012de
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#!/usr/bin/env node

// Test enhanced URL validation specifically for ArXiv and other problematic URLs
/** URLs exercised when the caller does not supply its own list. */
const DEFAULT_TEST_URLS = Object.freeze([
  // Valid ArXiv URLs
  'https://arxiv.org/abs/2001.08361', // Real paper
  'https://arxiv.org/abs/1706.03762', // Attention is All You Need

  // Invalid ArXiv URLs (the problematic ones)
  'https://arxiv.org/abs/2024.rag.advances', // Invalid format
  'https://arxiv.org/abs/2024.fake.paper', // Invalid format
  'https://arxiv.org/abs/9999.99999', // Non-existent paper

  // Other problematic URLs
  'https://vldb.org/vector-db-2024', // 404 page
  'https://cvpr.org', // Unreachable
]);

/** Search endpoint queried in the second phase of the test. */
const DEFAULT_SEARCH_ENDPOINT = 'http://localhost:5000/api/search';

/**
 * Accepted arXiv identifier shapes. Both allow an optional version suffix
 * (`v1`, `v2`, ...) so that links to a specific revision are not rejected.
 */
const ARXIV_ID_FORMATS = [
  /^\d{4}\.\d{4,5}(v\d+)?$/, // New format: 2024.12345 or 2024.12345v2
  /^[a-z-]+(\.[A-Z]{2})?\/\d{7}(v\d+)?$/, // Old format: cs.AI/1234567
];

/**
 * Lower-case phrases whose presence in a 2xx response body marks the page as
 * an error page.
 *
 * NOTE(review): 'error' is a very broad substring and may flag healthy pages
 * whose markup or scripts merely mention the word — confirm this is intended.
 */
const ERROR_INDICATORS = [
  'not recognized',
  'might instead try to search',
  'article identifier',
  'not found',
  'error',
];

/**
 * Extracts the paper identifier from an arXiv abstract URL.
 *
 * The host must be exactly `arxiv.org` or one of its subdomains (a plain
 * substring test would also accept look-alike hosts), and the identifier is
 * taken from the pathname so query strings and fragments never leak into it.
 *
 * @param {URL} urlObj - Parsed URL to inspect.
 * @returns {string|null} The identifier after `/abs/`, or `null` when the URL
 *   is not an arXiv abstract link.
 */
function extractArxivId(urlObj) {
  const host = urlObj.hostname;
  const isArxivHost = host === 'arxiv.org' || host.endsWith('.arxiv.org');
  if (!isArxivHost) {
    return null;
  }
  const match = urlObj.pathname.match(/^\/abs\/(.+?)\/?$/);
  return match ? match[1] : null;
}

/**
 * Validates a single URL and logs the verdict: arXiv format check first, then
 * a GET request, then a scan of the body for error phrases. Never throws;
 * parse and network failures are logged as an invalid result.
 *
 * @param {string} url - URL to validate.
 * @param {number} timeoutMs - Abort the request after this many milliseconds.
 * @returns {Promise<void>}
 */
async function checkSingleUrl(url, timeoutMs) {
  try {
    console.log(`Testing: ${url}`);

    // Simulate the validation logic
    const paperId = extractArxivId(new URL(url));
    if (paperId !== null) {
      console.log(`  ArXiv ID: ${paperId}`);

      const hasValidFormat = ARXIV_ID_FORMATS.some((regex) => regex.test(paperId));
      console.log(`  Format valid: ${hasValidFormat}`);

      if (!hasValidFormat) {
        // A malformed identifier can never resolve, so skip the request.
        console.log('  Result: ❌ INVALID (bad format)');
        return;
      }
    }

    // Test actual URL
    const response = await fetch(url, {
      method: 'GET',
      signal: AbortSignal.timeout(timeoutMs),
      headers: {
        'User-Agent': 'Knowledge-Base-Browser/1.0 (Enhanced Validator)',
      },
    });

    console.log(`  Status: ${response.status}`);

    if (!response.ok) {
      console.log(`  Result: ❌ INVALID (${response.status})`);
      return;
    }

    // A 2xx status is not enough: some sites serve error pages with 200.
    const content = (await response.text()).toLowerCase();
    const hasError = ERROR_INDICATORS.some((indicator) => content.includes(indicator));

    if (hasError) {
      console.log('  Content: Contains error messages');
      console.log('  Result: ❌ INVALID (error content)');
    } else {
      console.log('  Content: Valid');
      console.log('  Result: ✅ VALID');
    }
  } catch (error) {
    console.log(`  Error: ${error.message}`);
    console.log('  Result: ❌ INVALID (network error)');
  }
}

/**
 * Queries the search endpoint for "rag" and logs every result, flagging the
 * known-bad arXiv URL if it was not filtered out. Never throws; failures are
 * logged.
 *
 * @param {string} searchEndpoint - URL of the search API.
 * @param {number} timeoutMs - Abort the request after this many milliseconds.
 * @returns {Promise<void>}
 */
async function checkSearchFiltering(searchEndpoint, timeoutMs) {
  try {
    const response = await fetch(searchEndpoint, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({
        query: 'rag',
        searchType: 'semantic',
        limit: 10,
      }),
      signal: AbortSignal.timeout(timeoutMs),
    });

    if (!response.ok) {
      console.log('❌ Search request failed');
      return;
    }

    const data = await response.json();
    if (!Array.isArray(data?.results)) {
      // Reported through the catch below with a readable message.
      throw new Error('response does not contain a "results" array');
    }

    console.log(`Search for "rag" returned ${data.results.length} results:`);

    data.results.forEach((result, index) => {
      console.log(`${index + 1}. ${result.title}`);
      console.log(`   URL: ${result.url}`);

      // Check if this is the problematic ArXiv URL
      if (result.url.includes('2024.rag.advances')) {
        console.log('   ⚠️  This should have been filtered out!');
      } else {
        console.log('   ✅ Valid URL');
      }
      console.log('');
    });
  } catch (error) {
    console.log('❌ Search test failed:', error.message);
  }
}

/**
 * Runs the enhanced URL validation test: validates each URL individually,
 * then checks that the search endpoint does not return the known-bad arXiv
 * URL. All outcomes are written to the console; the function does not throw
 * for per-URL or search failures.
 *
 * @param {object} [options]
 * @param {Iterable<string>} [options.urls] - URLs to validate. Defaults to the
 *   built-in list of valid and problematic URLs.
 * @param {string} [options.searchEndpoint] - Search API URL. Defaults to
 *   `http://localhost:5000/api/search`.
 * @param {number} [options.timeoutMs=5000] - Per-URL request timeout.
 * @param {number} [options.searchTimeoutMs=30000] - Search request timeout.
 * @returns {Promise<void>}
 */
async function testEnhancedValidation({
  urls = DEFAULT_TEST_URLS,
  searchEndpoint = DEFAULT_SEARCH_ENDPOINT,
  timeoutMs = 5000,
  searchTimeoutMs = 30000,
} = {}) {
  console.log('🔍 Testing Enhanced URL Validation...\n');

  console.log('🧪 Testing individual URLs with enhanced validation...\n');

  // Sequential on purpose: keeps each URL's log lines grouped together.
  for (const url of urls) {
    await checkSingleUrl(url, timeoutMs);
    console.log('');
  }

  console.log('🔎 Testing search with enhanced validation...\n');

  // Test the search endpoint to see if problematic URLs are filtered
  await checkSearchFiltering(searchEndpoint, searchTimeoutMs);

  console.log('🎯 Enhanced Validation Test Complete!');
}

// Run the test when the script is executed. Any unexpected rejection is
// reported and reflected in the exit code instead of being left as a
// floating promise.
testEnhancedValidation().catch((error) => {
  console.error('Enhanced validation test crashed:', error);
  process.exitCode = 1;
});