# futeboy / match_info_crawler.py
# Author: Pablo Sampaio
# (Merge commit 2369060 of https://huggingface.co/spaces/pablo-sampaio/futeboy)
import datetime
from datetime import timedelta
import pytz
from bs4 import BeautifulSoup
import requests
# NOTE(review): this re-import shadows the `datetime` *module* imported above
# (the code below uses the `datetime` class, so it still works), and
# `timedelta` is imported twice — the duplicates could be dropped.
from datetime import datetime, timedelta
import os
import re
import google.generativeai as genai
from dotenv import load_dotenv, find_dotenv

load_dotenv(find_dotenv())  # should load the GOOGLE_API_KEY

# Reads GOOGLE_API_KEY from the environment populated by load_dotenv() above.
genai.configure()

# Gemini model used for extraction. gemini-1.0-pro is kept as a commented
# fallback; it lacks system-instruction support (handled further below).
MODEL_VERSION = "gemini-1.5-flash"
#MODEL_VERSION = "gemini-1.0-pro"

# All match dates/times in this module are interpreted in this timezone.
sao_paulo_timezone = pytz.timezone('America/Sao_Paulo')
# System prompt for the Gemini extractor: fixes the exact one-match-per-line
# output format and the filtering rules (skip under-20 and women's leagues,
# keep friendlies). This is runtime model input — do not reword casually.
SYSTEM_PROMPT = '''You will extract soccer match information from the text extracted from a html page,
and you have to output the matches in this format:
[TEAM HOME] [SCORE HOME] x [SCORE AWAY] [TEAM AWAY] - [CHAMPIONSHIP NAME] - [START TIME / MATCH TIME]
Regarding [START TIME / MATCH TIME]:
- if the match has already started, report the elapsed time in the match
- if it hasn't started, write 'not started'
- if the match has finished, report 'finished'
Additional instructions that you must follow:
- ignore matches for youth (under-20) and women leagues
- but report friendly matches, specially between national teams
Example output:
```
PSG 0 x 1 Borussia Dortmund - Champions League - finished
Palmeiras 0 x 2 Atletico Paranaense - Campeonato Brasileiro - Série A - finished
```
'''

# gemini-1.0-pro does not accept a system_instruction, so for that model the
# prompt is prepended to the user message at call time (see get_matches_info).
if MODEL_VERSION != "gemini-1.0-pro":
    EXTRACTOR_MODEL = genai.GenerativeModel(MODEL_VERSION,
                                            system_instruction=SYSTEM_PROMPT)  # TODO: set a very low temperature!
else:
    EXTRACTOR_MODEL = genai.GenerativeModel(MODEL_VERSION)

# HTML markers delimiting the "MAIS POPULARES AGORA" ("most popular now")
# section of the source page; it duplicates matches listed elsewhere and is
# stripped before parsing. The end marker is the opening tag of the *next*
# league heading, so it must be re-inserted after removal (see the call site).
START_MAIS_POP_SECTION = "<h3 class='match-list_league-name'>MAIS POPULARES AGORA</h3>"
END_MAIS_POP_SECTION = "<h3 class='match-list_league-name'"
def remove_between(main_string, key1, key2, to_replace=""):
    """Replace every span starting with *key1* and ending with *key2*
    (delimiters included, shortest match, spanning newlines) by *to_replace*.

    Parameters:
    - main_string: text to clean.
    - key1, key2: literal delimiter strings.
    - to_replace: replacement for each matched span (default: remove it).

    Returns the cleaned string; if no span matches, the input is unchanged.
    """
    # re.escape makes the delimiters literal: without it, keys containing
    # regex metacharacters (e.g. '(', '+', '.') would be misinterpreted or
    # raise re.error. DOTALL lets the non-greedy span cross line breaks.
    pattern = f"{re.escape(key1)}(.*?){re.escape(key2)}"
    return re.sub(pattern, to_replace, main_string, flags=re.DOTALL)
def get_matches_info(date_str: str):
    '''Returns matches info from all major leagues, including date and time in Brazilian timezone.

    Parameters:
    - date_str: either 'yesterday', 'today' or 'tomorrow', in Brazilian timezone.

    Returns:
    - String with one match per line; or empty string if the service is not available now.
    '''
    assert date_str in ['yesterday', 'today', 'tomorrow'], 'Invalid date_str parameter'

    now_sp = datetime.now(sao_paulo_timezone)

    # Map the requested day to its source page, its calendar date, and a cache
    # label: 'complete' = all matches finished (cache never goes stale),
    # 'incomplete' = scores may still change.
    if date_str == 'yesterday':
        link = 'https://www.placardefutebol.com.br/jogos-de-ontem'
        date = now_sp - timedelta(days=1)
        coverage = 'complete'  # renamed from `type`, which shadowed the builtin
    elif date_str == 'tomorrow':
        link = 'https://www.placardefutebol.com.br/jogos-de-amanha'
        date = now_sp + timedelta(days=1)
        coverage = 'incomplete'
    else:
        link = 'https://www.placardefutebol.com.br/jogos-de-hoje'
        date = now_sp
        coverage = 'incomplete'

    # Cache file named after the target date; reuse it if it already exists.
    # NOTE(review): 'incomplete' caches are also reused as-is, so today's
    # scores never refresh once written — confirm this is intended.
    filename = date.strftime('%Y-%m-%d') + f'-{coverage}.txt'
    if os.path.exists(filename):
        with open(filename, 'r', encoding='utf8') as file:
            return file.read()

    # Download the page. The timeout and narrow exception list replace the
    # original bare `except:`, which could hang forever and also swallowed
    # KeyboardInterrupt/SystemExit.
    try:
        page_raw = requests.get(link, timeout=15)
        page_raw.raise_for_status()  # treat HTTP errors as "service unavailable"
        page = page_raw.content.decode('utf-8')
    except (requests.RequestException, UnicodeDecodeError) as err:
        print('Error in downloading the page', link)
        print(err)
        return ""

    # Drop the "most popular now" section (it duplicates matches listed in the
    # per-league sections), re-inserting the league-heading tag it consumed.
    page = remove_between(page, START_MAIS_POP_SECTION, END_MAIS_POP_SECTION, END_MAIS_POP_SECTION)

    # Strip all HTML tags, then cut boilerplate around the match list.
    plain_page = BeautifulSoup(page, 'html.parser').get_text()
    plain_page = plain_page[:plain_page.rfind('[+] Mais campeonatos')]
    plain_page = plain_page.replace('Ver tabela e classificação', '')
    plain_page = remove_between(plain_page, "Placar ao vivo de", "dos jogos de hoje.")
    plain_page = remove_between(plain_page, "Aplicativo de", "Meu Mundial de Clubes")

    # Collapse any run of newlines to a single one (replaces the original
    # fixpoint loop of repeated '\n\n' -> '\n' replacements).
    plain_page = re.sub(r'\n{2,}', '\n', plain_page)
    #print("PROCESSING:\n", plain_page)

    # Ask the LLM to extract one match per line. gemini-1.0-pro has no
    # system-instruction support, so the prompt is prepended to the message.
    try:
        if MODEL_VERSION != "gemini-1.0-pro":
            response = EXTRACTOR_MODEL.generate_content("HTML PAGE TO BE PARSED: \n\n" + plain_page)
        else:
            prompt = "INSTRUCTIONS: \n\n" + SYSTEM_PROMPT + "\n\nHTML PAGE TO BE PARSED: \n\n" + plain_page
            response = EXTRACTOR_MODEL.generate_content(prompt)
    except Exception as e:
        print('Error in generating content:')
        print(e)
        return ""

    # Strip any markdown code fences the model may have wrapped around lines.
    text_out = response.text.replace('```', '')

    # Cache the result for subsequent calls on the same date.
    with open(filename, 'w', encoding='utf8') as file:
        file.write(text_out)

    return text_out
if __name__ == '__main__':
    # Quick manual check: fetch and print yesterday's (final) match results.
    print(get_matches_info('yesterday'))