import csv
import datetime
import json
import os
import pandas as pd

import gspread
from google.oauth2.service_account import Credentials
from utils_groupclassification.research_html_hybrid import research_html_hybrid
from utils_groupclassification.research_pdf_hybrid import research_pdf_hybrid
from utils_groupclassification.check_openai import co
from src.myLogger import set_logger
# from utils.save_xlsx import save_google_spreadsheet

logger = set_logger("my_app", level="INFO")


def _init_client(auth_path):
    # Load the service-account key and return an authorized gspread client.
    with open(auth_path, "r") as json_open:
        service_account_key = json.load(json_open)
    credentials = Credentials.from_service_account_info(service_account_key)
    scoped_credentials = credentials.with_scopes(
        [
            "https://spreadsheets.google.com/feeds",
            "https://www.googleapis.com/auth/drive",
        ]
    )
    client = gspread.authorize(scoped_credentials)
    return client
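
# Usage sketch (not part of the module's current flow): _init_client is not
# called anywhere in this file, so a caller would be expected to do something
# like the following. The key path and spreadsheet id are illustrative
# placeholders, not values taken from this project.
#
#     client = _init_client("./auth/service_account.json")
#     sheet = client.open_by_key("<spreadsheet_id>").sheet1
#     sheet.append_row(["company", "group"])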


def write_gspread(company_information):
    return write_xlsx(company_information)


def group_change(answer, group):
    # Promote the running group label: 'Group 1-1' overrides 'Group 5',
    # and 'Group 3' overrides both 'Group 1-1' and 'Group 5'.
    if answer == 'Group 1-1' and group == 'Group 5':
        group = 'Group 1-1'
    elif answer == 'Group 3' and (group == 'Group 1-1' or group == 'Group 5'):
        group = 'Group 3'

    return group
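
# Worked examples (derived from the rules above):
#   group_change('Group 1-1', 'Group 5')   -> 'Group 1-1'
#   group_change('Group 3', 'Group 1-1')   -> 'Group 3'
#   group_change('Group 5', 'Group 1-1')   -> 'Group 1-1'  (no change)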


def write_xlsx(company_name) -> tuple[str, list[str], list[str], list[str]]:
    base_path = "./gspread"
    os.makedirs(base_path, exist_ok=True)
    with open("./output.json") as f:
        config = json.load(f)

    # output.json holds the list of candidate URLs to evaluate
    input_urls = list(config)
    related_url = []
    unrelated_url = []
    other_url = []

    # Default classification; promoted by group_change() as evidence accumulates
    group = 'Group 5'

    logger.info(f"urls: {input_urls}")
    for url in input_urls:
        logger.info(f"company_name: {company_name}, url: {url}")
        try:
            # Strip stray quote characters that sometimes wrap the URL
            if url.endswith("'"):
                url = url[:-1]
            if url.endswith("']"):
                url = url[:-2]
            # Branch on whether the URL points to a PDF or an HTML page
            if url.endswith(".pdf"):
                logger.info(f"pdf: {url}")
                # Use co() to judge whether the content at the URL matches the company name
                judge, reason = co(company_name, url)
                logger.info(f"judge: {judge}, reason: {reason}")
                # If it matches, run research_pdf_hybrid
                if judge == 1:
                    logger.info("research_pdf_hybrid")
                    answer = research_pdf_hybrid(url, company_name)
                    group = group_change(answer, group)
                    related_url.append(url)
                # If it does not match, keep the returned reason as the answer
                elif judge == 0:
                    logger.info(f"reason: {reason}")
                    answer = reason
                    unrelated_url.append(url)
                # If the content cannot be retrieved, record that fact
                elif judge == -1:
                    logger.debug("Could not retrieve information from the URL")
                    answer = "Could not retrieve information from the URL"
                    other_url.append(url)
            else:
                logger.info(f"html: {url}")
                # Use co() to judge whether the content at the URL matches the company name
                judge, reason = co(company_name, url)
                logger.info(f"judge: {judge}, reason: {reason}")
                # If it matches, run research_html_hybrid
                if judge == 1:
                    logger.info("research_html_hybrid")
                    answer = research_html_hybrid(url, company_name)
                    group = group_change(answer, group)
                    related_url.append(url)
                # If it does not match, keep the returned reason as the answer
                elif judge == 0:
                    logger.info(f"reason: {reason}")
                    answer = reason
                    unrelated_url.append(url)
                # If the content cannot be retrieved, record that fact
                elif judge == -1:
                    logger.debug("Could not retrieve information from the URL")
                    answer = "Could not retrieve information from the URL"
                    other_url.append(url)
        except Exception as e:
            logger.error(f"Error: {e}")
            answer = ""
            other_url.append(url)

    logger.info(f'Group: {group}')

    return group, related_url, unrelated_url, other_url
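

# Minimal usage sketch, assuming ./output.json already lists the candidate URLs
# for the company being classified; "Example Corp" is a placeholder name.
if __name__ == "__main__":
    group, related, unrelated, other = write_xlsx("Example Corp")
    logger.info(
        f"group={group}, related={len(related)}, "
        f"unrelated={len(unrelated)}, other={len(other)}"
    )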