# (removed: web-page scrape artifacts — hosting-site metadata and line-number gutter — that were not part of the source)
import datetime
from datetime import timedelta
import pytz
from bs4 import BeautifulSoup
import requests
from datetime import datetime, timedelta
import os
import re
import google.generativeai as genai
# Load the Google AI API key: prefer the environment variable, otherwise
# fall back to a local key file (one key, trailing newlines removed).
GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY')
if GOOGLE_API_KEY is None:
    with open('KEY_GOOGLE_AI', 'r') as key_file:
        GOOGLE_API_KEY = key_file.read().replace('\n', '')
genai.configure(api_key=GOOGLE_API_KEY)

# When True, use the stronger Gemini model with a system instruction.
USE_BEST_EXTRACTOR = True

# All "yesterday/today/tomorrow" date arithmetic is done in this timezone.
sao_paulo_timezone = pytz.timezone('America/Sao_Paulo')
# Prompt that tells the model how to extract and format match information
# from the plain text of the scraped page.
SYSTEM_PROMPT = '''You will extract soccer match information from the text extracted from a html page,
and you have to output the matches in this format:
[TEAM HOME] [SCORE HOME] x [SCORE AWAY] [TEAM AWAY] - [CHAMPIONSHIP NAME] - [START TIME / MATCH TIME]
Regarding [START TIME / MATCH TIME]:
- if the match has already started, report the elapsed time in the match
- if it hasn't started write 'not started'
- if the match has finished, report 'finished'
More information:
- ignore matches for youth (under-20) and women leagues
- example output:
```
PSG 0 x 1 Borussia Dortmund - Champions League - finished
Palmeiras 0 x 2 Atletico Paranaense - Campeonato Brasileiro - Série A - finished
```
'''

if USE_BEST_EXTRACTOR:
    # Stronger model, prompt attached as a system instruction. Temperature is
    # pinned to 0.0 for deterministic extraction (resolves the former TODO
    # "set a very low temperature").
    EXTRACTOR_MODEL = genai.GenerativeModel(
        'gemini-1.5-pro-latest',
        system_instruction=SYSTEM_PROMPT,
        generation_config=genai.GenerationConfig(temperature=0.0),
    )
else:
    # Older model has no system_instruction support; the prompt is prepended
    # to the user content at call time instead (see get_matches_info).
    EXTRACTOR_MODEL = genai.GenerativeModel('gemini-1.0-pro')

# HTML markers delimiting the "MAIS POPULARES AGORA" (most popular now)
# section, which duplicates matches already listed elsewhere on the page.
START_MAIS_POP_SECTION = "<h3 class='match-list_league-name'>MAIS POPULARES AGORA</h3>"
END_MAIS_POP_SECTION = "<h3 class='match-list_league-name'"
def remove_between(main_string, key1, key2, to_replace=""):
    """Replace every span that starts with key1 and ends with key2 (both inclusive).

    Matching is non-greedy and spans newlines. key1 and key2 are treated as
    literal text (re.escape), and to_replace is inserted verbatim via a
    callable, so regex metacharacters in any argument — '[', '(', '\\g', a
    lone backslash — can no longer raise re.error or corrupt the output.

    Parameters:
    - main_string: text to scan.
    - key1, key2: literal start/end delimiters of the span(s) to replace.
    - to_replace: literal replacement text (default: remove the span).

    Returns the rewritten string.
    """
    pattern = f"{re.escape(key1)}(.*?){re.escape(key2)}"
    return re.sub(pattern, lambda _match: to_replace, main_string, flags=re.DOTALL)
def get_matches_info(date_str: str) -> str:
    '''Returns matches info from all major leagues, including date and time in Brazilian timezone.

    Results are cached in a local text file named after the date, so repeated
    calls for the same date do not hit the network or the LLM again.

    Parameters:
    - date_str: either 'yesterday', 'today' or 'tomorrow', in Brazilian timezone.

    Returns:
    - String with one match per line; or empty string if the service is not available now.

    Raises:
    - ValueError: if date_str is not one of the accepted values.
    '''
    # Validate input explicitly (assert is stripped under `python -O`).
    if date_str not in ('yesterday', 'today', 'tomorrow'):
        raise ValueError(f'Invalid date_str parameter: {date_str!r}')

    current_date = datetime.now(sao_paulo_timezone)
    if date_str == 'yesterday':
        link = 'https://www.placardefutebol.com.br/jogos-de-ontem'
        date = current_date - timedelta(days=1)
        kind = 'complete'    # scores are final, cache can be trusted forever
    elif date_str == 'tomorrow':
        link = 'https://www.placardefutebol.com.br/jogos-de-amanha'
        date = current_date + timedelta(days=1)
        kind = 'incomplete'  # schedule only, may change
    else:
        link = 'https://www.placardefutebol.com.br/jogos-de-hoje'
        date = current_date
        kind = 'incomplete'

    # Cache file named after the requested date; reuse it if present.
    filename = date.strftime('%Y-%m-%d') + f'-{kind}.txt'
    if os.path.exists(filename):
        with open(filename, 'r', encoding='utf8') as file:
            return file.read()

    # Download the page. Bounded timeout so the caller is never stuck, and
    # HTTP error pages are rejected instead of being parsed (and cached).
    try:
        page_raw = requests.get(link, timeout=30)
        page_raw.raise_for_status()
        page = page_raw.content.decode('utf-8')
    except (requests.RequestException, UnicodeDecodeError) as exc:
        print('Error in downloading the page', link, exc)
        return ""

    # Drop the "most popular now" section (duplicates matches listed below),
    # keeping the end marker so the following league header survives.
    page = remove_between(page, START_MAIS_POP_SECTION, END_MAIS_POP_SECTION, END_MAIS_POP_SECTION)
    plain_page = BeautifulSoup(page, 'html.parser').get_text()

    # Strip footer/boilerplate around the match list.
    cutoff = plain_page.rfind('[+] Mais campeonatos')
    if cutoff != -1:  # only cut when the marker exists (rfind == -1 would drop the last char)
        plain_page = plain_page[:cutoff]
    plain_page = plain_page.replace('Ver tabela e classificação', '')
    plain_page = remove_between(plain_page, "Placar ao vivo de", "dos jogos de hoje.")
    plain_page = remove_between(plain_page, "Aplicativo de", "Meu Mundial de Clubes")

    # Collapse any run of consecutive newlines into a single one
    # (replaces the original replace-until-fixpoint loop).
    plain_page = re.sub(r'\n{2,}', '\n', plain_page)

    try:
        if USE_BEST_EXTRACTOR:
            # The instructions are already attached via system_instruction.
            response = EXTRACTOR_MODEL.generate_content("HTML PAGE TO BE PARSED: \n\n" + plain_page)
        else:
            # Older model: prepend the instructions to the user content.
            prompt = "INSTRUCTIONS: \n\n" + SYSTEM_PROMPT + "\n\nHTML PAGE TO BE PARSED: \n\n" + plain_page
            response = EXTRACTOR_MODEL.generate_content(prompt)
    except Exception as exc:
        # Broad catch is deliberate: any SDK/quota/safety failure degrades
        # to the documented "service not available" empty string.
        print('Error in generating content:')
        print(exc)
        return ""

    # Remove stray markdown fences the model sometimes emits.
    text_out = response.text.replace('```', '')

    # Cache the result for subsequent calls on the same date.
    with open(filename, 'w', encoding='utf8') as file:
        file.write(text_out)
    return text_out
if __name__ == '__main__':
    # Quick manual check: print yesterday's (finished) matches.
    print(get_matches_info('yesterday'))