Spaces:
Sleeping
Sleeping
| # | |
| # | |
| # def is_likely_product_card(element, min_text_length=10): | |
| # """ | |
| # Determine if an element is likely to be a product card based on various heuristics. | |
| # """ | |
| # # 1. Check for common product card class/id patterns | |
| # identifier = element.get('class', []) + [element.get('id', '')] | |
| # product_patterns = ['product', 'item', 'card', 'goods', 'listing'] | |
| # if any(any(pattern in str(attr).lower() for pattern in product_patterns) for attr in identifier): | |
| # return True | |
| # | |
| # # 2. Check for price patterns | |
| # text_content = element.get_text() | |
| # price_patterns = [ | |
| # r'\$\d+\.?\d*', # USD | |
| # r'£\d+\.?\d*', # GBP | |
| # r'€\d+\.?\d*', # EUR | |
| # r'\d+\.?\d*\s*USD', | |
| # r'\d+\.?\d*\s*EUR' | |
| # ] | |
| # if any(re.search(pattern, text_content) for pattern in price_patterns): | |
| # return True | |
| # | |
| # # 3. Check for minimum text content (measured after collapsing runs of whitespace to single spaces) | |
| # clean_text = ' '.join(text_content.split()) | |
| # if len(clean_text) < min_text_length: | |
| # return False | |
| # | |
| # # 4. Check for a heading tag (h1-h6), used as a proxy for a product title | |
| # has_title = bool(element.find(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])) | |
| # | |
| # return has_title | |
| # | |
| # | |
| # def should_exclude_element(element): | |
| # # """ | |
| # # Return True if any class or id token of the element (split on '-' and '_', lowercased) exactly matches a known non-product section name. | |
| # # """ | |
| # | |
| # # 1. Exclude common non-product sections | |
| # exclude_patterns = [ | |
| # 'filter', 'filters', 'sidebar', 'menu', 'nav', 'header', 'footer', 'cart', | |
| # 'search', 'pagination', 'sort', 'banner', 'ad', 'slider' | |
| # ] | |
| # | |
| # # Check class and id | |
| # element_classes = ' '.join(element.get('class', [])).replace("-", " ").replace("_", " ").lower().split() | |
| # element_id = str(element.get('id', '')).replace("-", " ").replace("_", " ").lower().split() | |
| # | |
| # print(element_classes) | |
| # | |
| # for pattern in exclude_patterns: | |
| # if pattern in element_classes: | |
| # print(f"Excluded element due to class containing '{pattern}'") | |
| # return True | |
| # if pattern in element_id: | |
| # print(f"Excluded element due to id containing '{pattern}'") | |
| # return True | |
| # | |
| # return False | |