patrickacraig committed
Commit 709e431 · 1 Parent(s): 5566cb9

some improvements

Files changed (2)
  1. app.py +66 -64
  2. web_ui.py +57 -33
app.py CHANGED
@@ -1,80 +1,82 @@
  from firecrawl import FirecrawlApp
  import os
  import time
+ import asyncio
  from dotenv import load_dotenv
  from urllib.parse import urlparse

-
  load_dotenv()

  base_url = os.getenv('BASE_URL')
-
- def map_website(url):
-     # Initialize the Firecrawl application with the API key
-     app = FirecrawlApp(api_key=os.getenv('FIRECRAWL_API_KEY'))
-
-     # Use the /map endpoint to get all URLs from the website
-     map_status = app.map_url(url)
-
-     # Check if the mapping was successful
-     if isinstance(map_status, list):
-         return map_status
-     else:
-         print("Failed to map the website:", map_status)
-         return []
-
- def scrape_url(url):
-     # Initialize the Firecrawl application with the API key
-     app = FirecrawlApp(api_key=os.getenv('FIRECRAWL_API_KEY'))
-
-     # Use the /scrape endpoint to scrape the URL
-     scrape_status = app.scrape_url(url)
-
-     # Print the scrape_status to understand its structure
-     print(f"Scrape status for {url}: {scrape_status}")
-
-     # Check if the scraping was successful
-     if 'markdown' in scrape_status:
-         return scrape_status['markdown']
-     else:
-         print(f"Failed to scrape {url}: {scrape_status}")
+ api_key = os.getenv('FIRECRAWL_API_KEY')
+ limit_rate = os.getenv('LIMIT_RATE', 'False').lower() == 'true'
+
+ # Get Firecrawl App instance
+ def get_firecrawl_app(api_key):
+     return FirecrawlApp(api_key=api_key)
+
+ # Asynchronous scrape URL
+ async def async_scrape_url(app, url):
+     try:
+         scrape_status = app.scrape_url(url)
+         print(f"Scrape status for {url}: {scrape_status}")
+         if 'markdown' in scrape_status:
+             return scrape_status['markdown']
+         else:
+             print(f"Failed to scrape {url}: {scrape_status}")
+             return ""
+     except Exception as e:
+         print(f"Error scraping {url}: {e}")
          return ""

- def scrape_all_urls(base_url):
-     # Map the URLs
-     urls = map_website(base_url)
-
-     # Parse the base URL to get the domain without 'www' and scheme
-     parsed_url = urlparse(base_url)
-     domain = parsed_url.netloc.replace("www.", "")
-
-     # Create the directory if it doesn't exist
-     os.makedirs('scraped_documentation', exist_ok=True)
-
-     # Generate the output file name and save location
-     output_file = os.path.join('scraped_documentation', f"{domain}.md")
-
-     # Open the output file in write mode
-     with open(output_file, 'w', encoding='utf-8') as md_file:
-         # Iterate over the URLs
-         for i, url in enumerate(urls):
-             # Print the URL being scraped
-             print(f"Scraping {url} ({i+1}/{len(urls)})")
-
-             # Scrape the URL
-             markdown_content = scrape_url(url)
-
-             # Write the scraped content to the file
-             md_file.write(f"# {url}\n\n")
-             md_file.write(markdown_content)
-             md_file.write("\n\n---\n\n")
+ # Synchronously map website URLs
+ def map_website(app, url):
+     try:
+         map_status = app.map_url(url)
+         if isinstance(map_status, list):
+             return map_status
+         else:
+             print("Failed to map the website:", map_status)
+             return []
+     except Exception as e:
+         print(f"Error mapping website {url}: {e}")
+         return []

-             # Rate limiting: 10 scrapes per minute
-             if os.getenv('LIMIT_RATE') == 'True':
-                 if (i + 1) % 10 == 0:
+ # Asynchronously scrape all URLs
+ def scrape_all_urls(base_url, api_key, limit_rate):
+     async def scrape_process():
+         app = get_firecrawl_app(api_key)
+         urls = map_website(app, base_url)
+         if not urls:
+             print("No URLs found. Please check if the base URL is correct.")
+             return
+
+         parsed_url = urlparse(base_url)
+         domain = parsed_url.netloc.replace("www.", "")
+         os.makedirs('scraped_documentation', exist_ok=True)
+         output_file = os.path.join('scraped_documentation', f"{domain}.md")
+
+         with open(output_file, 'w', encoding='utf-8') as md_file:
+             for i, url in enumerate(urls):
+                 print(f"Scraping {url} ({i+1}/{len(urls)})")
+                 markdown_content = await async_scrape_url(app, url)
+                 md_file.write(f"# {url}\n\n")
+                 md_file.write(markdown_content)
+                 md_file.write("\n\n---\n\n")
+
+                 # Rate limiting: 10 scrapes per minute
+                 if limit_rate and (i + 1) % 10 == 0:
                      print("Rate limit reached, waiting for 60 seconds...")
                      time.sleep(60)

+         print(f"Scraping completed. Output saved to {output_file}")
+
+     asyncio.run(scrape_process())
+
  if __name__ == "__main__":
-
-     scrape_all_urls(base_url)
+     if not base_url:
+         print("Error: BASE_URL not specified in environment variables.")
+     elif not api_key:
+         print("Error: FIRECRAWL_API_KEY not specified in environment variables.")
+     else:
+         scrape_all_urls(base_url, api_key, limit_rate)
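For context, the rewritten app.py pulls all of its configuration from environment variables, typically supplied through a .env file that python-dotenv loads at startup. The new LIMIT_RATE check also accepts any casing of "true", where the old code required exactly the string "True". A minimal sketch of such a file, with placeholder values only, might look like:

```
# .env (example values; substitute your own documentation site and key)
BASE_URL=https://docs.example.com
FIRECRAWL_API_KEY=fc-your-api-key-here
LIMIT_RATE=true
```

With those variables set, the command-line scraper is run as `python app.py`, and the output lands in scraped_documentation/<domain>.md as before.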
web_ui.py CHANGED
@@ -1,5 +1,6 @@
  import os
  import time
+ import asyncio
  from dotenv import load_dotenv
  from urllib.parse import urlparse
  from firecrawl import FirecrawlApp
@@ -7,62 +8,81 @@ import gradio as gr

  load_dotenv()

- def map_website(url, api_key):
-     app = FirecrawlApp(api_key=api_key)
-     map_status = app.map_url(url)
-     if isinstance(map_status, list):
-         return map_status
-     else:
-         print("Failed to map the website:", map_status)
-         return []
+ def get_firecrawl_app(api_key):
+     return FirecrawlApp(api_key=api_key)

- def scrape_url(url, api_key):
-     app = FirecrawlApp(api_key=api_key)
-     scrape_status = app.scrape_url(url)
-     print(f"Scrape status for {url}: {scrape_status}")
-     if 'markdown' in scrape_status:
-         return scrape_status['markdown']
-     else:
-         print(f"Failed to scrape {url}: {scrape_status}")
+ async def async_scrape_url(app, url):
+     try:
+         scrape_status = app.scrape_url(url)
+         print(f"Scrape status for {url}: {scrape_status}")
+         if 'markdown' in scrape_status:
+             return scrape_status['markdown']
+         else:
+             print(f"Failed to scrape {url}: {scrape_status}")
+             return ""
+     except Exception as e:
+         print(f"Error scraping {url}: {e}")
          return ""

- def scrape_all_urls(base_url, api_key, limit_rate, progress=gr.Progress()):
-     urls = map_website(base_url, api_key)
+ def map_website(app, url):
+     try:
+         map_status = app.map_url(url)
+         if isinstance(map_status, list):
+             return map_status
+         else:
+             print("Failed to map the website:", map_status)
+             return []
+     except Exception as e:
+         print(f"Error mapping website {url}: {e}")
+         return []
+
+ async def scrape_all_urls(base_url, api_key, limit_rate, progress=gr.Progress()):
+     app = get_firecrawl_app(api_key)
+     urls = map_website(app, base_url)
+     if not urls:
+         return "No URLs found. Please check if the base URL is correct."
+
      parsed_url = urlparse(base_url)
      domain = parsed_url.netloc.replace("www.", "")
      os.makedirs('scraped_documentation', exist_ok=True)
      output_file = os.path.join('scraped_documentation', f"{domain}.md")
-
+
      with open(output_file, 'w', encoding='utf-8') as md_file:
          for i, url in enumerate(progress.tqdm(urls)):
              progress(i / len(urls), f"Scraping {url}")
-             markdown_content = scrape_url(url, api_key)
+             markdown_content = await async_scrape_url(app, url)
              md_file.write(f"# {url}\n\n")
              md_file.write(markdown_content)
              md_file.write("\n\n---\n\n")
-             if limit_rate:
-                 if (i + 1) % 10 == 0:
-                     time.sleep(60)
-
+             if limit_rate and (i + 1) % 10 == 0:
+                 time.sleep(60)
+
      return f"Scraping completed. Output saved to {output_file}"

  def count_urls(base_url, api_key):
      if not api_key:
          return "Please enter your Firecrawl API key first."
-     urls = map_website(base_url, api_key)
-     return f"{len(urls)} URLs found. Do you want to proceed with scraping?"
+     app = get_firecrawl_app(api_key)
+     urls = map_website(app, base_url)
+     if urls:
+         return f"{len(urls)} URLs found. Do you want to proceed with scraping?"
+     else:
+         return "No URLs found. Please check the base URL or API key."

- def gradio_scrape(base_url, api_key, limit_rate):
+ async def gradio_scrape(base_url, api_key, limit_rate):
      if not api_key:
          return "Please enter your Firecrawl API key."
      if not base_url:
          return "Please enter a base URL to scrape."
-     return scrape_all_urls(base_url, api_key, limit_rate)
+     return await scrape_all_urls(base_url, api_key, limit_rate)

  with gr.Blocks() as iface:
      gr.Markdown("# Docs Scraper")
-     gr.Markdown("## To map and scrape all URLs from a given website using the Firecrawl API, enter a base URL to scrape, your Firecrawl API key, and choose whether to limit the rate of scraping.")
-     gr.Markdown("Scraped content is saved into a markdown file named after the domain of the base URL, making it easy to reference and utilize. This can be particularly useful for AI code editors that need to gather context from various types of websites. By scraping the content, the AI can analyze and understand the structure and information provided, which can enhance its ability to offer accurate code suggestions and improvements.")
+     gr.Markdown("""
+     ## Map and Scrape Website URLs with Firecrawl API
+     Enter a base URL, your Firecrawl API key, and choose whether to limit the scraping rate.
+     Scraped content will be saved as a markdown file named after the domain.
+     """)
      gr.HTML('Don\'t have an API key? <a href="https://firecrawl.dev/" target="_blank" rel="noopener noreferrer">Get one from Firecrawl</a>')

      with gr.Row():
@@ -74,7 +94,8 @@ with gr.Blocks() as iface:
          info="Enable to limit scraping to 10 URLs per minute. This adheres to Firecrawl API's free tier rate limit."
      )

-     gr.Markdown("After entering your API key, click 'Count URLs' to determine the number of URLs to be scraped. Then, click 'Scrape URLs' to begin the process. The progress and file location will be displayed in the textbox labeled 'Output'.")
+     gr.Markdown("After entering your API key, click 'Count URLs' to determine the number of URLs to be scraped. Then, click 'Scrape URLs' to begin the process.")
+
      with gr.Row():
          count_button = gr.Button("Count URLs")
          url_count = gr.Textbox(label="URL Count")
@@ -83,8 +104,11 @@ with gr.Blocks() as iface:
          scrape_button = gr.Button("Scrape URLs")
          output = gr.Textbox(label="Output", elem_id="output_textbox")

-     gr.Markdown("#### Note: The free tier of the Firecrawl API allows for 500 credits per month. If you need to scrape more, you can upgrade to a paid plan. The 'Count URLs' button may not work as expected if the base URL is not correctly specified or if the API key is invalid. Always ensure the base URL is correct and the API key is valid before proceeding.")
-
+     gr.Markdown("""
+     #### Note:
+     The free tier of the Firecrawl API allows for 500 credits per month.
+     If you need to scrape more, consider upgrading to a paid plan.
+     """)

      count_button.click(count_urls, inputs=[base_url, api_key], outputs=[url_count])
      scrape_button.click(gradio_scrape, inputs=[base_url, api_key, limit_rate], outputs=[output])
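A caveat on both rewrites: async_scrape_url is declared async but still calls the synchronous app.scrape_url directly and is awaited one URL at a time, so scraping remains effectively sequential even with asyncio in the loop. If genuine concurrency were wanted later, one option, not part of this commit and assuming the Firecrawl client stays synchronous, would be to offload each blocking call to a worker thread and gather a batch of results, roughly like this:

```python
import asyncio

# Sketch only, not from the commit: scrape a batch of URLs concurrently by
# offloading the blocking Firecrawl call to threads. `app` and `urls` would
# come from the existing get_firecrawl_app/map_website helpers, and batching
# should still respect the 10-requests-per-minute free-tier limit.
async def scrape_concurrently(app, urls, batch_size=10):
    results = {}
    for start in range(0, len(urls), batch_size):
        batch = urls[start:start + batch_size]
        statuses = await asyncio.gather(
            *(asyncio.to_thread(app.scrape_url, url) for url in batch)
        )
        for url, status in zip(batch, statuses):
            # Mirror the existing success check on the 'markdown' key
            results[url] = status.get('markdown', '') if isinstance(status, dict) else ''
    return results
```

The thread offload only removes the artificial serialization; it does not raise the Firecrawl quota, so the rate-limit checkbox in the UI remains the safer default on the free tier.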