# NOTE(review): the "Spaces: Sleeping" lines above this point in the capture are a
# Hugging Face Spaces status banner picked up with the page export — not source code.
# Standard library imports.
import os
import re
from datetime import datetime, timedelta

# Third-party imports.
import pytz
import requests
from bs4 import BeautifulSoup
import google.generativeai as genai

# NOTE(review): the original file also had `import datetime` and a first
# `from datetime import timedelta`; both were shadowed/duplicated by the
# `from datetime import datetime, timedelta` line, so the final namespace
# here is identical (the name `datetime` is the class, not the module).

# Load the Google AI API key from a local file (newlines stripped).
with open('KEY_GOOGLE_AI', 'r') as file:
    GOOGLE_API_KEY = file.read().replace('\n', '')
genai.configure(api_key=GOOGLE_API_KEY)

# When True, use the stronger (slower/costlier) Gemini model for extraction.
USE_BEST_EXTRACTOR = True

# All dates/times in this script are interpreted in São Paulo (Brazilian) time.
sao_paulo_timezone = pytz.timezone('America/Sao_Paulo')
CURRENT_DATE = datetime.now(sao_paulo_timezone)
# System prompt for the LLM extractor: defines the exact one-match-per-line
# output format and the filtering rules (skip youth/women leagues).
SYSTEM_PROMPT = '''You will extract soccer match information from the text extracted from a html page,
and you have to output the matches in this format:
[TEAM HOME] [SCORE HOME] x [SCORE AWAY] [TEAM AWAY] - [CHAMPIONSHIP NAME] - [START TIME / MATCH TIME]
Regarding [START TIME / MATCH TIME]:
- if the match has already started, report the elapsed time in the match
- if it hasn't started write 'not started'
- if the match has finished, report 'finished'
More information:
- ignore matches for youth (under-20) and women leagues
- example output:
```
PSG 0 x 1 Borussia Dortmund - Champions League - finished
Palmeiras 0 x 2 Atletico Paranaense - Campeonato Brasileiro - Série A - finished
```
'''
if USE_BEST_EXTRACTOR:
    # Stronger model: the extraction rules go in as a system instruction, and a
    # very low temperature keeps the structured output as deterministic as
    # possible (this resolves the old TODO: "set a very low temperature!").
    EXTRACTOR_MODEL = genai.GenerativeModel(
        'gemini-1.5-pro-latest',
        system_instruction=SYSTEM_PROMPT,
        generation_config={'temperature': 0.1},
    )
else:
    # Cheaper model without a system instruction; the prompt is prepended to
    # the user content at call time (see get_matches_info).
    EXTRACTOR_MODEL = genai.GenerativeModel('gemini-1.0-pro')
# HTML markers delimiting the "MAIS POPULARES AGORA" (most popular now) section,
# which duplicates matches listed elsewhere on the page and is stripped before
# parsing. The end marker is the opening tag of the NEXT league header, so it is
# re-inserted as the replacement when the section is removed.
START_MAIS_POP_SECTION = "<h3 class='match-list_league-name'>MAIS POPULARES AGORA</h3>"
END_MAIS_POP_SECTION = "<h3 class='match-list_league-name'"
def remove_between(main_string, key1, key2, to_replace=""):
    '''Replace every "key1 ... key2" span (inclusive, non-greedy, spanning
    newlines) in main_string with to_replace, and return the result.

    key1 and key2 are treated as literal text: they are regex-escaped before
    being compiled into the pattern, so delimiters containing metacharacters
    (e.g. '(', '[', '+') work correctly instead of breaking the regex.
    '''
    pattern = f"{re.escape(key1)}(.*?){re.escape(key2)}"
    # DOTALL lets '.' cross newlines, so spans may cover multiple lines.
    return re.sub(pattern, to_replace, main_string, flags=re.DOTALL)
def get_matches_info(date_str: str) -> str:
    '''Return matches info from all major leagues, with date/time in the
    Brazilian timezone.

    Parameters:
    - date_str: one of 'yesterday', 'today' or 'tomorrow', in Brazilian timezone.

    Returns:
    - A string with one match per line; or an empty string if the page cannot
      be downloaded or the LLM call fails.
    '''
    assert date_str in ['yesterday', 'today', 'tomorrow'], 'Invalid date_str parameter'

    # Map the requested day to its page URL, its actual date, and whether the
    # results are final ('complete') or may still change ('incomplete').
    if date_str == 'yesterday':
        link = 'https://www.placardefutebol.com.br/jogos-de-ontem'
        date = CURRENT_DATE - timedelta(days=1)
        day_status = 'complete'
    elif date_str == 'tomorrow':
        link = 'https://www.placardefutebol.com.br/jogos-de-amanha'
        date = CURRENT_DATE + timedelta(days=1)
        day_status = 'incomplete'
    else:
        link = 'https://www.placardefutebol.com.br/jogos-de-hoje'
        date = CURRENT_DATE
        day_status = 'incomplete'

    # Cache file named after the requested date; reuse it when present.
    filename = date.strftime('%Y-%m-%d') + f'-{day_status}.txt'
    if os.path.exists(filename):
        with open(filename, 'r', encoding='utf8') as file:
            return file.read()

    # Download the page. Bounded timeout so the call cannot hang forever, and a
    # narrow except (the original bare `except:` also swallowed KeyboardInterrupt).
    try:
        page_raw = requests.get(link, timeout=30)
        page = page_raw.content.decode('utf-8')
    except (requests.RequestException, UnicodeDecodeError):
        print('Error in downloading the page', link)
        return ""

    # Drop the "most popular now" section: it duplicates matches listed below.
    page = remove_between(page, START_MAIS_POP_SECTION, END_MAIS_POP_SECTION, END_MAIS_POP_SECTION)

    # Strip the HTML down to plain text.
    parser = BeautifulSoup(page, 'html.parser')
    plain_page = parser.get_text()

    # Remove boilerplate that confuses the extractor. Guard rfind: the original
    # unconditional slice chopped the last character when the marker was absent.
    cut = plain_page.rfind('[+] Mais campeonatos')
    if cut != -1:
        plain_page = plain_page[:cut]
    plain_page = plain_page.replace('Ver tabela e classificação', '')
    plain_page = remove_between(plain_page, "Placar ao vivo de", "dos jogos de hoje.")
    plain_page = remove_between(plain_page, "Aplicativo de", "Meu Mundial de Clubes")

    # Collapse runs of blank lines until the text stops changing.
    old_text = ''
    new_text = plain_page
    while new_text != old_text:
        old_text = new_text
        new_text = old_text.replace('\n\n', '\n')
    plain_page = new_text

    # Ask the LLM to extract the matches from the plain text.
    try:
        if USE_BEST_EXTRACTOR:
            # System prompt is already attached to the model as a system instruction.
            response = EXTRACTOR_MODEL.generate_content("HTML PAGE TO BE PARSED: \n\n" + plain_page)
        else:
            prompt = "INSTRUCTIONS: \n\n" + SYSTEM_PROMPT + "\n\nHTML PAGE TO BE PARSED: \n\n" + plain_page
            response = EXTRACTOR_MODEL.generate_content(prompt)
    except Exception as e:  # boundary: log the failure and return empty
        print('Error in generating content:')
        print(e)
        return ""

    # Strip markdown code fences the model sometimes adds, then cache to disk.
    text_out = response.text.replace('```', '')
    with open(filename, 'w', encoding='utf8') as file:
        file.write(text_out)
    return text_out
if __name__ == '__main__':
    # Quick manual check: fetch and print yesterday's (finished) matches.
    print(get_matches_info('yesterday'))