Update app.py
app.py
CHANGED
@@ -1,13 +1,13 @@
-from flask import Flask, render_template, request
+import re
+import logging
+from typing import List, Dict
 import feedparser
 import requests
 from bs4 import BeautifulSoup
-import urllib.parse
-import logging
-from typing import List, Dict
 import time
+import urllib.parse
 
-#
+# Existing logging configuration (unchanged)
 logging.basicConfig(
     level=logging.INFO,
     format='%(asctime)s - %(levelname)s - %(message)s',
@@ -18,143 +18,100 @@ logging.basicConfig(
 )
 logger = logging.getLogger(__name__)
 
-app = Flask(__name__)
-
-# List of Craigslist cities (partial list for brevity; expand as needed)
-CRAIGSLIST_CITIES = [
-    "all", "newyork", "losangeles", "chicago", "houston", "phoenix", "philadelphia",
-    "sanantonio", "sandiego", "dallas", "sanjose", "austin", "jacksonville",
-    "sanfrancisco", "columbus", "seattle", "denver", "boston", "miami", "atlanta"
-]
-
-def search_craigslist(query: str, city: str) -> List[Dict]:
-    """
-    Search Craigslist for a query in a specific city or all cities.
-    Returns a list of posts with title, link, description, and date.
-    """
-    start_time = time.time()
-    logger.info(f"Starting search for query: '{query}' in city: '{city}'")
-
-    posts = []
-    query = urllib.parse.quote(query.strip())  # URL-encode the query
-
-    try:
-        if city == "all":
-            # Search across multiple cities
-            for city_name in CRAIGSLIST_CITIES[1:]:  # Skip "all"
-                url = f"https://{city_name}.craigslist.org/search/sss?query={query}"
-                logger.debug(f"Fetching URL: {url}")
-                feed = fetch_feed_with_retry(url)
-                posts.extend(parse_feed(feed, city_name))
-        else:
-            # Search in a specific city
-            url = f"https://{city}.craigslist.org/search/sss?query={query}"
-            logger.debug(f"Fetching URL: {url}")
-            feed = fetch_feed_with_retry(url)
-            posts.extend(parse_feed(feed, city))
-
-        logger.info(f"Search completed in {time.time() - start_time:.2f} seconds. Found {len(posts)} posts")
-        return posts
-
-    except Exception as e:
-        logger.error(f"Error during search: {str(e)}")
-        return []
-
 def fetch_feed_with_retry(url: str, retries: int = 3, delay: int = 2) -> feedparser.FeedParserDict:
     """
     Fetch RSS feed with retry mechanism to handle network issues.
+    Attempts to preprocess feed content to handle malformed XML.
     """
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+    }
+
     for attempt in range(retries):
         try:
-            feed = feedparser.parse(url)
+            # Fetch feed content using requests
+            logger.debug(f"Fetching URL (attempt {attempt + 1}): {url}")
+            response = requests.get(url, headers=headers, timeout=10)
+            response.raise_for_status()
+
+            # Preprocess feed content to clean invalid characters
+            feed_content = preprocess_feed_content(response.text)
+
+            # Parse the cleaned content with feedparser
+            feed = feedparser.parse(feed_content)
+
             if feed.bozo:
                 logger.warning(f"Feed parsing warning for {url}: {feed.bozo_exception}")
-            return feed
+                # If feed is partially valid (has entries), continue; otherwise retry
+                if feed.entries:
+                    return feed
+                else:
+                    logger.warning(f"No valid entries in feed for {url}, retrying...")
+            else:
+                return feed
+
+        except requests.RequestException as e:
+            logger.error(f"Network error on attempt {attempt + 1} for {url}: {str(e)}")
         except Exception as e:
-            logger.error(f"Error fetching feed for {url} on attempt {attempt + 1}: {str(e)}")
-            if attempt < retries - 1:
-                time.sleep(delay)
-
+            logger.error(f"Unexpected error on attempt {attempt + 1} for {url}: {str(e)}")
+
+        if attempt < retries - 1:
+            logger.info(f"Retrying after {delay} seconds...")
+            time.sleep(delay)
+
+    logger.error(f"Failed to fetch valid feed after {retries} attempts: {url}")
     return feedparser.parse("")  # Return empty feed on failure
 
+def preprocess_feed_content(content: str) -> str:
+    """
+    Clean feed content to remove invalid XML characters and fix common issues.
+    """
+    try:
+        # Remove invalid XML characters
+        invalid_xml = re.compile(u'[\x00-\x08\x0b\x0c\x0e-\x1F\uD800-\uDFFF\uFFFE\uFFFF]')
+        content = invalid_xml.sub('', content)
+
+        # Fix unescaped ampersands (common in Craigslist feeds)
+        content = re.sub(r'&(?![A-Za-z]+[0-9]*;|#[0-9]+;|#x[0-9a-fA-F]+;)', '&amp;', content)
+
+        # Ensure UTF-8 encoding
+        content = content.encode('utf-8', errors='ignore').decode('utf-8')
+
+        return content
+    except Exception as e:
+        logger.error(f"Error preprocessing feed content: {str(e)}")
+        return content  # Return original content if preprocessing fails
+
 def parse_feed(feed: feedparser.FeedParserDict, city: str) -> List[Dict]:
     """
     Parse the RSS feed and extract relevant post information.
+    Skips invalid entries to handle malformed feed content.
     """
     posts = []
     try:
-        for entry in feed.entries:
-            # Clean description using BeautifulSoup
-            description = entry.get("description", "")
-            soup = BeautifulSoup(description, "html.parser")
-            clean_description = soup.get_text(strip=True)
-            clean_description = clean_description[:200] + "..." if len(clean_description) > 200 else clean_description
-
-            post = {
-                "title": entry.get("title", "No title"),
-                "link": entry.get("link", "#"),
-                "description": clean_description,
-                "date": entry.get("published", "No date"),
-                "city": city.capitalize()
-            }
-            posts.append(post)
-
-
+        for index, entry in enumerate(feed.entries):
+            try:
+                # Clean description using BeautifulSoup
+                description = entry.get("description", "")
+                soup = BeautifulSoup(description, "html.parser")
+                clean_description = soup.get_text(strip=True)
+                clean_description = clean_description[:200] + "..." if len(clean_description) > 200 else clean_description
+
+                post = {
+                    "title": entry.get("title", "No title"),
+                    "link": entry.get("link", "#"),
+                    "description": clean_description,
+                    "date": entry.get("published", "No date"),
+                    "city": city.capitalize()
+                }
+                posts.append(post)
+            except Exception as e:
+                logger.warning(f"Skipping invalid entry {index} in feed for city {city}: {str(e)}")
+                continue
+
+        logger.debug(f"Parsed {len(posts)} valid posts for city: {city}")
         return posts
 
     except Exception as e:
         logger.error(f"Error parsing feed for city {city}: {str(e)}")
-        return []
-
-@app.route("/", methods=["GET", "POST"])
-def index():
-    posts = []
-    query = ""
-    selected_city = "all"
-
-    try:
-        if request.method == "POST":
-            query = request.form.get("query", "").strip()
-            selected_city = request.form.get("city", "all")
-
-            # Validate inputs
-            if not query:
-                logger.warning("Empty query received")
-                return render_template(
-                    "index.html",
-                    posts=[],
-                    query="",
-                    cities=CRAIGSLIST_CITIES,
-                    selected_city=selected_city,
-                    error="Please enter a search query"
-                )
-
-            if selected_city not in CRAIGSLIST_CITIES:
-                logger.warning(f"Invalid city selected: {selected_city}")
-                selected_city = "all"
-
-            logger.info(f"Processing POST request: query='{query}', city='{selected_city}'")
-            posts = search_craigslist(query, selected_city)
-
-        return render_template(
-            "index.html",
-            posts=posts,
-            query=query,
-            cities=CRAIGSLIST_CITIES,
-            selected_city=selected_city
-        )
-
-    except Exception as e:
-        logger.error(f"Error in index route: {str(e)}")
-        return render_template(
-            "index.html",
-            posts=[],
-            query=query,
-            cities=CRAIGSLIST_CITIES,
-            selected_city=selected_city,
-            error="An error occurred while processing your request"
-        )
-
-if __name__ == "__main__":
-    logger.info("Starting Flask application")
-    app.run(host="0.0.0.0", port=7860, debug=True)
+        return []
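
The ampersand fix in preprocess_feed_content is easy to sanity-check in isolation: the negative lookahead escapes bare "&" characters while leaving named entities and numeric character references alone. A quick check (the sample string is invented for illustration):

import re

sample = 'Sofa & loveseat &amp; table &#38; chairs'
fixed = re.sub(r'&(?![A-Za-z]+[0-9]*;|#[0-9]+;|#x[0-9a-fA-F]+;)', '&amp;', sample)
print(fixed)  # Sofa &amp; loveseat &amp; table &#38; chairs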
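
parse_feed can likewise be exercised without the network by handing feedparser a feed string directly. A minimal sketch, assuming app.py is importable as a module (the feed below is a made-up example, not real Craigslist output):

import feedparser
from app import parse_feed

rss = """<?xml version="1.0"?>
<rss version="2.0"><channel><title>demo</title>
<item><title>Desk</title><link>https://example.org/1</link>
<description>&lt;p&gt;Solid oak desk&lt;/p&gt;</description>
<pubDate>Mon, 01 Jan 2024 00:00:00 GMT</pubDate></item>
</channel></rss>"""

posts = parse_feed(feedparser.parse(rss), "seattle")
print(posts[0]["description"])  # "Solid oak desk" -- HTML stripped by BeautifulSoup
print(posts[0]["city"])         # "Seattle"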
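
With the Flask routes and the search_craigslist wrapper removed from this file by the commit, callers now compose the two remaining helpers themselves. A minimal driver sketch under the same importability assumption; the seattle subdomain and query are placeholders:

import urllib.parse
from app import fetch_feed_with_retry, parse_feed

query = urllib.parse.quote("mountain bike")  # URL-encode, as search_craigslist used to do
url = f"https://seattle.craigslist.org/search/sss?query={query}"

feed = fetch_feed_with_retry(url, retries=3, delay=2)  # retries on network/parse failures
posts = parse_feed(feed, "seattle")                    # skips malformed entries
for post in posts[:5]:
    print(post["date"], post["title"], post["link"])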