broadfield-dev committed on
Commit
bce38d2
·
verified ·
1 Parent(s): de83290

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +81 -124
app.py CHANGED
@@ -1,13 +1,13 @@
1
- from flask import Flask, request, render_template
 
 
2
  import feedparser
3
  import requests
4
  from bs4 import BeautifulSoup
5
- import urllib.parse
6
- import logging
7
- from typing import List, Dict
8
  import time
 
9
 
10
- # Configure logging
11
  logging.basicConfig(
12
  level=logging.INFO,
13
  format='%(asctime)s - %(levelname)s - %(message)s',
@@ -18,143 +18,100 @@ logging.basicConfig(
18
  )
19
  logger = logging.getLogger(__name__)
20
 
21
- app = Flask(__name__)
22
-
23
- # List of Craigslist cities (partial list for brevity; expand as needed)
24
- CRAIGSLIST_CITIES = [
25
- "all", "newyork", "losangeles", "chicago", "houston", "phoenix", "philadelphia",
26
- "sanantonio", "sandiego", "dallas", "sanjose", "austin", "jacksonville",
27
- "sanfrancisco", "columbus", "seattle", "denver", "boston", "miami", "atlanta"
28
- ]
29
-
30
- def search_craigslist(query: str, city: str) -> List[Dict]:
31
- """
32
- Search Craigslist for a query in a specific city or all cities.
33
- Returns a list of posts with title, link, description, and date.
34
- """
35
- start_time = time.time()
36
- logger.info(f"Starting search for query: '{query}' in city: '{city}'")
37
-
38
- posts = []
39
- query = urllib.parse.quote(query.strip()) # URL-encode the query
40
-
41
- try:
42
- if city == "all":
43
- # Search across multiple cities
44
- for city_name in CRAIGSLIST_CITIES[1:]: # Skip "all"
45
- url = f"https://{city_name}.craigslist.org/search/sss?query={query}"
46
- logger.debug(f"Fetching URL: {url}")
47
- feed = fetch_feed_with_retry(url)
48
- posts.extend(parse_feed(feed, city_name))
49
- else:
50
- # Search in a specific city
51
- url = f"https://{city}.craigslist.org/search/sss?query={query}"
52
- logger.debug(f"Fetching URL: {url}")
53
- feed = fetch_feed_with_retry(url)
54
- posts.extend(parse_feed(feed, city))
55
-
56
- logger.info(f"Search completed in {time.time() - start_time:.2f} seconds. Found {len(posts)} posts")
57
- return posts
58
-
59
- except Exception as e:
60
- logger.error(f"Error during search: {str(e)}")
61
- return []
62
-
63
  def fetch_feed_with_retry(url: str, retries: int = 3, delay: int = 2) -> feedparser.FeedParserDict:
64
  """
65
  Fetch RSS feed with retry mechanism to handle network issues.
 
66
  """
 
 
 
 
67
  for attempt in range(retries):
68
  try:
69
- feed = feedparser.parse(url)
 
 
 
 
 
 
 
 
 
 
70
  if feed.bozo:
71
  logger.warning(f"Feed parsing warning for {url}: {feed.bozo_exception}")
72
- return feed
 
 
 
 
 
 
 
 
 
73
  except Exception as e:
74
- logger.error(f"Attempt {attempt + 1} failed for {url}: {str(e)}")
75
- if attempt < retries - 1:
76
- time.sleep(delay)
77
- logger.error(f"Failed to fetch feed after {retries} attempts: {url}")
 
 
 
78
  return feedparser.parse("") # Return empty feed on failure
79
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  def parse_feed(feed: feedparser.FeedParserDict, city: str) -> List[Dict]:
81
  """
82
  Parse the RSS feed and extract relevant post information.
 
83
  """
84
  posts = []
85
  try:
86
- for entry in feed.entries:
87
- # Clean description using BeautifulSoup
88
- description = entry.get("description", "")
89
- soup = BeautifulSoup(description, "html.parser")
90
- clean_description = soup.get_text(strip=True)
91
- # Truncate description if necessary
92
- clean_description = clean_description[:200] + "..." if len(clean_description) > 200 else clean_description
93
-
94
- post = {
95
- "title": entry.get("title", "No title"),
96
- "link": entry.get("link", "#"),
97
- "description": clean_description,
98
- "date": entry.get("published", "No date"),
99
- "city": city.capitalize()
100
- }
101
- posts.append(post)
102
- logger.debug(f"Parsed {len(posts)} posts for city: {city}")
 
 
 
 
103
  return posts
104
 
105
  except Exception as e:
106
  logger.error(f"Error parsing feed for city {city}: {str(e)}")
107
- return []
108
-
109
- @app.route("/", methods=["GET", "POST"])
110
- def index():
111
- posts = []
112
- query = ""
113
- selected_city = "all"
114
-
115
- try:
116
- if request.method == "POST":
117
- query = request.form.get("query", "").strip()
118
- selected_city = request.form.get("city", "all")
119
-
120
- # Validate inputs
121
- if not query:
122
- logger.warning("Empty query received")
123
- return render_template(
124
- "index.html",
125
- posts=[],
126
- query="",
127
- cities=CRAIGSLIST_CITIES,
128
- selected_city=selected_city,
129
- error="Please enter a search query"
130
- )
131
-
132
- if selected_city not in CRAIGSLIST_CITIES:
133
- logger.warning(f"Invalid city selected: {selected_city}")
134
- selected_city = "all"
135
-
136
- logger.info(f"Processing POST request: query='{query}', city='{selected_city}'")
137
- posts = search_craigslist(query, selected_city)
138
-
139
- return render_template(
140
- "index.html",
141
- posts=posts,
142
- query=query,
143
- cities=CRAIGSLIST_CITIES,
144
- selected_city=selected_city
145
- )
146
-
147
- except Exception as e:
148
- logger.error(f"Error in index route: {str(e)}")
149
- return render_template(
150
- "index.html",
151
- posts=[],
152
- query=query,
153
- cities=CRAIGSLIST_CITIES,
154
- selected_city=selected_city,
155
- error="An error occurred while processing your request"
156
- )
157
-
158
- if __name__ == "__main__":
159
- logger.info("Starting Flask application")
160
- app.run(host="0.0.0.0", port=7860, debug=True)
 
1
+ import re
2
+ import logging
3
+ from typing import List, Dict
4
  import feedparser
5
  import requests
6
  from bs4 import BeautifulSoup
 
 
 
7
  import time
8
+ import urllib.parse
9
 
10
+ # Existing logging configuration (unchanged)
11
  logging.basicConfig(
12
  level=logging.INFO,
13
  format='%(asctime)s - %(levelname)s - %(message)s',
 
18
  )
19
  logger = logging.getLogger(__name__)
20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
def fetch_feed_with_retry(url: str, retries: int = 3, delay: int = 2) -> feedparser.FeedParserDict:
    """
    Fetch an RSS feed with a retry mechanism to handle network issues.

    The document is downloaded with ``requests`` (so headers and a timeout
    are under our control), run through ``preprocess_feed_content`` to scrub
    malformed XML, and only then handed to ``feedparser``. A bozo feed that
    still yields entries is accepted; one with no entries triggers a retry.
    Returns an empty feed if every attempt fails.
    """
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    for attempt in range(retries):
        try:
            logger.debug(f"Fetching URL (attempt {attempt + 1}): {url}")
            response = requests.get(url, headers=request_headers, timeout=10)
            response.raise_for_status()

            # Scrub invalid XML before parsing.
            cleaned = preprocess_feed_content(response.text)
            parsed = feedparser.parse(cleaned)

            if not parsed.bozo:
                return parsed

            logger.warning(f"Feed parsing warning for {url}: {parsed.bozo_exception}")
            if parsed.entries:
                # Partially valid feed — still usable, so take it.
                return parsed
            logger.warning(f"No valid entries in feed for {url}, retrying...")

        except requests.RequestException as e:
            logger.error(f"Network error on attempt {attempt + 1} for {url}: {str(e)}")
        except Exception as e:
            logger.error(f"Unexpected error on attempt {attempt + 1} for {url}: {str(e)}")

        if attempt < retries - 1:
            logger.info(f"Retrying after {delay} seconds...")
            time.sleep(delay)

    logger.error(f"Failed to fetch valid feed after {retries} attempts: {url}")
    return feedparser.parse("")  # Return empty feed on failure
64
 
65
def preprocess_feed_content(content: str) -> str:
    """
    Clean feed content to remove invalid XML characters and fix common issues.

    Normalization steps:
      1. strip characters illegal in XML 1.0 (control chars, surrogates,
         U+FFFE/U+FFFF);
      2. escape bare '&' characters that are not already part of a named,
         decimal, or hex entity (unescaped ampersands are common in
         Craigslist feeds);
      3. round-trip through UTF-8, dropping anything unencodable.

    If any step raises, the *unmodified* input is returned so callers never
    receive a half-processed string.
    """
    # Snapshot the input: the original code returned `content` from the
    # except branch, but `content` is rebound by each step, so a failure
    # mid-pipeline leaked partially processed text despite the comment
    # promising the original.
    original = content
    try:
        # Remove characters that are invalid in XML 1.0 documents.
        invalid_xml = re.compile(u'[\x00-\x08\x0b\x0c\x0e-\x1F\uD800-\uDFFF\uFFFE\uFFFF]')
        content = invalid_xml.sub('', content)

        # Fix unescaped ampersands; existing entities such as &amp;,
        # &#38;, and &#x26; are left untouched by the lookahead.
        content = re.sub(r'&(?![A-Za-z]+[0-9]*;|#[0-9]+;|#x[0-9a-fA-F]+;)', '&amp;', content)

        # Ensure the text survives UTF-8 encoding (drop unencodable chars).
        content = content.encode('utf-8', errors='ignore').decode('utf-8')

        return content
    except Exception as e:
        logger.error(f"Error preprocessing feed content: {str(e)}")
        return original  # Return the untouched input if preprocessing fails
84
+
85
def parse_feed(feed: feedparser.FeedParserDict, city: str) -> List[Dict]:
    """
    Parse the RSS feed and extract relevant post information.

    Each entry becomes a dict with title, link, a plain-text description
    capped at 200 characters, the published date, and the capitalized city
    name. An entry that fails to convert is logged and skipped so one bad
    item cannot sink the whole feed; a failure iterating the feed itself
    yields an empty list.
    """
    posts = []
    try:
        for index, entry in enumerate(feed.entries):
            try:
                # Strip HTML tags from the description via BeautifulSoup.
                raw_html = entry.get("description", "")
                text = BeautifulSoup(raw_html, "html.parser").get_text(strip=True)
                # Truncate long descriptions to keep listings compact.
                if len(text) > 200:
                    text = text[:200] + "..."

                posts.append({
                    "title": entry.get("title", "No title"),
                    "link": entry.get("link", "#"),
                    "description": text,
                    "date": entry.get("published", "No date"),
                    "city": city.capitalize(),
                })
            except Exception as e:
                logger.warning(f"Skipping invalid entry {index} in feed for city {city}: {str(e)}")
                continue

        logger.debug(f"Parsed {len(posts)} valid posts for city: {city}")
        return posts

    except Exception as e:
        logger.error(f"Error parsing feed for city {city}: {str(e)}")
        return []