File size: 4,922 Bytes
627753f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4bb2add
 
 
627753f
4bb2add
627753f
 
2369060
 
627753f
 
 
 
 
 
 
 
 
 
4bb2add
627753f
 
4bb2add
627753f
4bb2add
 
 
627753f
 
 
 
 
 
2369060
 
 
627753f
2369060
627753f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82b1a2e
627753f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2369060
627753f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import datetime
from datetime import timedelta
import pytz

from bs4 import BeautifulSoup

import requests
from datetime import datetime, timedelta

import os
import re

import google.generativeai as genai


from dotenv import load_dotenv, find_dotenv

# Load variables from the nearest .env file into the process environment
# before the Gemini client is configured (expected to provide GOOGLE_API_KEY).
load_dotenv(find_dotenv()) # should load the GOOGLE_API_KEY

# No explicit key is passed here.
# NOTE(review): presumably the library picks GOOGLE_API_KEY up from the
# environment populated above - confirm against the google-generativeai docs.
genai.configure()


# Gemini model used for extraction. The value "gemini-1.0-pro" is special-cased
# below: that model is built without a system instruction and receives the
# instructions inside the prompt text instead.
MODEL_VERSION = "gemini-1.5-flash"
#MODEL_VERSION = "gemini-1.0-pro"

# Timezone used to resolve "yesterday" / "today" / "tomorrow" into calendar dates.
sao_paulo_timezone = pytz.timezone('America/Sao_Paulo')


# Instructions given to the extractor model: the expected one-line-per-match
# output format, how to report the match time/status, and which matches to skip.
# It is passed as the model's system instruction, or prepended to the prompt
# when MODEL_VERSION is "gemini-1.0-pro". The text is sent to the model as-is,
# so any edit to it changes runtime behavior.
SYSTEM_PROMPT = '''You will extract soccer match information from the text extracted from a html page, 
and you have to output the matches in this format:
[TEAM HOME] [SCORE HOME] x [SCORE AWAY] [TEAM AWAY] - [CHAMPIONSHIP NAME] - [START TIME / MATCH TIME]

Regarding [START TIME / MATCH TIME]:
- if the match has already started, report the elapsed time in the match
- if it hasn't started, write 'not started'
- if the match has finished, report 'finished'

Additional instructions that you must follow:
- ignore matches for youth (under-20) and women leagues
- but report friendly matches, specially between national teams

Example output:
```
PSG 0 x 1 Borussia Dortmund - Champions League - finished
Palmeiras 0 x 2 Atletico Paranaense - Campeonato Brasileiro - Série A - finished
```
'''

# Build the shared extractor model once, at import time.
if MODEL_VERSION == "gemini-1.0-pro":
    # This version is created without a system instruction; get_matches_info
    # prepends SYSTEM_PROMPT to the prompt text instead.
    EXTRACTOR_MODEL = genai.GenerativeModel(MODEL_VERSION)
else:
    # TODO: set a very low temperature!
    EXTRACTOR_MODEL = genai.GenerativeModel(
        MODEL_VERSION,
        system_instruction=SYSTEM_PROMPT,
    )


# HTML markers used by get_matches_info to cut a section out of the downloaded page.
# START is the complete heading tag of the "MAIS POPULARES AGORA" section.
START_MAIS_POP_SECTION = "<h3 class='match-list_league-name'>MAIS POPULARES AGORA</h3>"
# END is only the opening of a league-name heading tag, so it matches the next
# such heading after START. It is also passed as the replacement text, which
# keeps that next heading's opening in the page.
END_MAIS_POP_SECTION = "<h3 class='match-list_league-name'"



def remove_between(main_string, key1, key2, to_replace=""):
    '''Replaces every span that starts with key1 and ends with the nearest following key2.
    Parameters:
    - main_string: text to process.
    - key1: literal text that opens a span (included in what is replaced).
    - key2: literal text that closes a span (included in what is replaced).
    - to_replace: literal text inserted in place of each span (default: nothing).
    Returns:
    - The processed string; main_string unchanged if no span is found.
    '''
    # Escape both keys so characters such as '.', '(' or '[' are matched
    # literally instead of being interpreted as regex syntax.
    pattern = re.escape(key1) + ".*?" + re.escape(key2)
    # A function replacement makes re.sub insert to_replace verbatim, without
    # processing backslashes or group references inside it.
    # DOTALL lets a span run across line breaks.
    return re.sub(pattern, lambda _match: to_replace, main_string, flags=re.DOTALL)


def get_matches_info(date_str: str) -> str:
    '''Returns soccer matches info (teams, score, championship and match status) for one day.
    The day is resolved in the Brazilian (America/Sao_Paulo) timezone. The page for
    that day is downloaded from placardefutebol.com.br, reduced to plain text and
    handed to the extractor model. The result is cached in a text file named after
    the date, in the current working directory, and reused on later calls.
    Parameters:
    - date_str parameter is either 'yesterday', 'today' or 'tomorrow', in Brazilian timezone.
    Returns:
    - String with one match per line; or empty string if the service is not available now.
    Raises:
    - AssertionError if date_str is not one of the accepted values.
    '''
    assert date_str in ['yesterday', 'today', 'tomorrow'], 'Invalid date_str parameter'
    now = datetime.now(sao_paulo_timezone)

    if date_str == 'yesterday':
        link = 'https://www.placardefutebol.com.br/jogos-de-ontem'
        date = now - timedelta(days=1)
        cache_kind = 'complete'
    elif date_str == 'tomorrow':
        link = 'https://www.placardefutebol.com.br/jogos-de-amanha'
        date = now + timedelta(days=1)
        cache_kind = 'incomplete'
    else:
        link = 'https://www.placardefutebol.com.br/jogos-de-hoje'
        date = now
        cache_kind = 'incomplete'

    # Cache file named after the requested date.
    # NOTE(review): 'today' and 'tomorrow' share the '-incomplete' suffix and a
    # cached file is always returned as-is, so results for a given date are not
    # refreshed once written - confirm this is intended.
    filename = date.strftime('%Y-%m-%d') + f'-{cache_kind}.txt'

    try:
        with open(filename, 'r', encoding='utf8') as file:
            return file.read()
    except FileNotFoundError:
        pass  # nothing cached yet: fetch and extract below

    # Download the page. A timeout keeps a stalled server from hanging the
    # call, and an HTTP error status is treated as "service not available"
    # rather than parsing (and caching) an error page.
    try:
        page_raw = requests.get(link, timeout=30)
        page_raw.raise_for_status()
        page = page_raw.content.decode('utf-8')
    except (requests.RequestException, UnicodeDecodeError) as e:
        print('Error in downloading the page', link)
        print(e)
        return ""

    # Drop the "MAIS POPULARES AGORA" section; the replacement puts back the
    # opening of the next league heading that the end marker consumed.
    page = remove_between(page, START_MAIS_POP_SECTION, END_MAIS_POP_SECTION, END_MAIS_POP_SECTION)

    plain_page = BeautifulSoup(page, 'html.parser').get_text()

    # Cut everything from the last '[+] Mais campeonatos' onwards, but only if
    # that marker is present (a bare slice on rfind's -1 would silently drop
    # the final character of the page).
    footer_start = plain_page.rfind('[+] Mais campeonatos')
    if footer_start != -1:
        plain_page = plain_page[:footer_start]
    plain_page = plain_page.replace('Ver tabela e classificação', '')

    plain_page = remove_between(plain_page, "Placar ao vivo de", "dos jogos de hoje.")
    plain_page = remove_between(plain_page, "Aplicativo de", "Meu Mundial de Clubes")

    # Collapse every run of consecutive newlines into a single one.
    plain_page = re.sub(r'\n{2,}', '\n', plain_page)

    try:
        if MODEL_VERSION != "gemini-1.0-pro":
            prompt = "HTML PAGE TO BE PARSED: \n\n" + plain_page
        else:
            # This model was built without a system instruction, so the
            # instructions travel inside the prompt itself.
            prompt = "INSTRUCTIONS: \n\n" + SYSTEM_PROMPT + "\n\nHTML PAGE TO BE PARSED: \n\n" + plain_page
        response = EXTRACTOR_MODEL.generate_content(prompt)
        # Reading .text is kept inside the try: the accessor can itself raise
        # when the response carries no text part.
        text_out = response.text
    except Exception as e:
        print('Error in generating content:')
        print(e)
        return ""

    # Strip the code-fence markers the model may wrap its answer in.
    text_out = text_out.replace('```', '')

    # Save the result so later calls for the same date reuse it.
    with open(filename, 'w', encoding='utf8') as file:
        file.write(text_out)

    return text_out



if __name__ == '__main__':
    # Manual run: fetch (or load from cache) yesterday's matches and print them.
    print(get_matches_info('yesterday'))