Spaces:

Juneyy
/

PredictTemp24Hours

Running

PredictTemp24Hours / scrapeandappendforecastdata.py

Juneyy

Update scrapeandappendforecastdata.py

bb3d009 unverified 4 months ago

No virus

3.01 kB

	# -- coding: utf-8 --
	"""ScrapeandAppendForecastData.ipynb

	Automatically generated by Colaboratory.

	Original file is located at
	https://colab.research.google.com/drive/1Rz0MA1Ooa6x8OlgE6Q2S6YI1eJCKuiSz
	"""

	import pandas as pd
	import matplotlib.pyplot as plt
	import keras
	import requests
	import pandas as pd
	import numpy as np
	from datetime import date
	from datetime import timedelta
	from bs4 import BeautifulSoup
	import csv

	"""
	Since the time doesn't return a date and the data is pulled in reverse chronological order
	"""


	def main():
	today = date.today() - timedelta(days=1)

	path = ''

	url = "https://www.localconditions.com/weather-alhambra-california/91801/past.php"
	file = requests.get(url).content
	soup = BeautifulSoup(file, "html.parser")

	headers = ['Time PST', 'Temp (F)', 'Humidity', "Dew Point", 'Barometer', 'Wind Speed (in HG)', 'Wind Direction',
	'Wind Gust (MPH)', '1hr. Precip / Rain Total (in.)', 'Snow Depth']

	dictionary_data = {}
	for key in headers:
	dictionary_data[key] = []

	"""
	all the data regarding the temperature have a table header

	each hour stamp is separated by a tr

	each column is separated by a td
	"""

	for i in range(1, len(soup.findAll('table'))):
	temp = soup.findAll('table')[i].findAll('tr')
	for j in range(1, len(temp) - 1):
	parse = temp[j].find_all("td")
	if len(parse) != 10:
	continue
	for index, (key) in enumerate(dictionary_data.keys()):
	if key == 'Time PST':
	dictionary_data[key].append("{} {}".format(today - timedelta(days=i), parse[index].text))
	else:
	dictionary_data[key].append(parse[index].text)

	pd_data = pd.DataFrame.from_dict(dictionary_data)
	pd_data = pd_data.drop(['Dew Point', 'Barometer', '1hr. Precip / Rain Total (in.)', 'Snow Depth', 'Wind Direction'],
	axis=1)

	new_headers = ['Time PST', 'Temp (F)', 'Humidity', 'Wind Speed (in HG)', 'Wind Gust (MPH)']

	for header in new_headers:
	pd_data = pd_data[pd_data[header] != '-']

	pd_data['Time PST'] = pd.to_datetime(pd_data['Time PST'])
	pd_data = pd_data.iloc[::-1]
	pd_data.reset_index(drop=True, inplace=True)
	pd_data['Temp (F)'] = pd_data['Temp (F)'].astype(int)
	pd_data['Humidity'] = pd_data['Humidity'].astype(int)
	pd_data['Wind Speed (in HG)'] = pd_data['Wind Speed (in HG)'].astype(float)
	pd_data['Wind Gust (MPH)'] = pd_data['Wind Gust (MPH)'].astype(float)

	pd_data.to_csv(path + 'data/' + 'weather_data.csv', mode='a', index=False, header=False)

	remove_dup = pd.read_csv(path + 'data/' + 'weather_data.csv')

	remove_dup = remove_dup.drop_duplicates(subset=['Time PST'])

	remove_dup = remove_dup.sort_values(by = ['Time PST'])
	remove_dup.to_csv(path + 'data/' + 'weather_data.csv', mode='w', index=False, header=True)


	if __name__ == "__main__":
	main()