tensorgirl committed on
Commit
284c80a
1 Parent(s): 30040ca

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +122 -155
utils.py CHANGED
@@ -1,156 +1,123 @@
1
- import datetime
2
- from urllib.request import Request, urlopen
3
- from pypdf import PdfReader
4
- from io import StringIO
5
- import io
6
- import pandas as pd
7
- import os
8
- import torch
9
- from transformers import AutoTokenizer, AutoModelForSequenceClassification
10
- from transformers import pipeline
11
- from openai import OpenAI
12
- from groq import Groq
13
- import time
14
- from openai import OpenAI
15
-
16
# Descriptor values accepted by the pipeline: the first column of
# Descriptor.xlsx, which is read without a header row. Loaded once at import.
# NOTE(review): an OpenAI API key used to be committed here inside a comment.
# Secrets belong in the environment, and the exposed key should be revoked.
desc = pd.read_excel('Descriptor.xlsx', header=None)
desc_list = desc.iloc[:, 0].to_list()
20
-
21
def callAzure(prompt, text):
    """Send ``prompt`` followed by ``text`` to the Azure-hosted chat model.

    The two arguments are joined with a single space, the result is cut to
    its first 7000 characters, and it is sent as a single user message.

    Args:
        prompt: Instruction text placed at the start of the message.
        text: Document text appended after the prompt.

    Returns:
        The message content of the first choice in the completion response.

    Raises:
        RuntimeError: If the ``AZURE_MISTRAL_API_KEY`` environment variable
            is not set.
    """
    # The key was previously hard-coded in this function. Credentials must
    # come from the environment so they never land in version control; the
    # previously committed key should be treated as compromised and rotated.
    api_key = os.getenv("AZURE_MISTRAL_API_KEY")
    if not api_key:
        raise RuntimeError(
            "AZURE_MISTRAL_API_KEY is not set; export the Azure endpoint key "
            "before calling callAzure()"
        )

    url = "https://Mistral-large-tmhcg-serverless.eastus2.inference.ai.azure.com"
    client = OpenAI(base_url=url, api_key=api_key)

    # Hard cap on the request size (presumably to stay inside the model's
    # input limit -- TODO confirm the intended budget).
    msg = "{} {}".format(prompt, text)[:7000]

    response = client.chat.completions.create(
        messages=[{"role": "user", "content": msg}],
        model="azureai",
    )
    return response.choices[0].message.content
40
-
41
def call(prompt, text, model=None):
    """Ask a Groq-hosted chat model to summarise ``text``.

    The fixed suffix " Answer only the summary, no instructions" is appended
    to ``prompt``; prompt and text are then joined with a single space and
    sent as one user message.

    Args:
        prompt: Instruction text for the model.
        text: Document text appended after the prompt.
        model: Groq model identifier. When omitted, the ``GROQ_MODEL``
            environment variable is used.

    Returns:
        The message content of the first choice in the completion response.

    Raises:
        ValueError: If no model is given and ``GROQ_MODEL`` is not set.
    """
    # The original body referenced a global ``model`` that is never defined
    # in this module, so every call failed with NameError. Resolve the model
    # explicitly and fail with a clear message instead.
    model = model or os.getenv("GROQ_MODEL")
    if not model:
        raise ValueError(
            "No Groq model specified: pass model=... or set GROQ_MODEL"
        )

    # The Groq API key is read from the environment variable named "key".
    client = Groq(api_key=os.getenv("key"))

    prompt = prompt + " Answer only the summary, no instructions"
    chat_completion = client.chat.completions.create(
        messages=[{"role": "user", "content": "{} {}".format(prompt, text)}],
        model=model,
    )
    return chat_completion.choices[0].message.content
56
-
57
def filter(input_json):
    """Validate an announcement record and extract the text of its PDF.

    Checks run in order and stop at the first failure:

    1. ``FileURL`` must be a string other than ``'null'`` (case-insensitive).
    2. ``symbol`` must not be ``'null'`` and must appear in the first column
       of ``symbol.xlsx``.
    3. ``TypeofAnnouncement`` must be ``'General_Announcements'``,
       ``'Outcome'`` or ``'General'``.
    4. ``Descriptor`` must appear in the module-level ``desc_list``.

    Args:
        input_json: Mapping holding the four keys listed above.

    Returns:
        ``[0, reason]`` when a check fails, with ``reason`` one of
        ``"File_URL"``, ``"symbol"``, ``"Annoucement"`` or ``"Desc"``;
        otherwise ``[1, document]`` where ``document`` is the concatenated
        text of every page of the downloaded PDF.
    """
    # Validate the URL before touching any spreadsheet: it is the cheapest
    # check, and a missing or non-string value must not crash on .lower().
    file_url = input_json.get('FileURL')
    if not isinstance(file_url, str) or file_url.lower() == 'null':
        return [0, "File_URL"]

    sym_list = pd.read_excel('symbol.xlsx', header=None).iloc[:, 0].to_list()
    symbol = input_json.get('symbol')
    if symbol == 'null' or symbol not in sym_list:
        return [0, "symbol"]

    allowed_types = ('General_Announcements', 'Outcome', 'General')
    if input_json.get('TypeofAnnouncement') not in allowed_types:
        # The misspelt reason string is kept as-is: callers may match on it.
        return [0, "Annoucement"]

    if input_json.get('Descriptor') not in desc_list:
        return [0, "Desc"]

    # Only the part after 'Pname=' is used to build the attachment URL.
    url = ('https://www.bseindia.com/xml-data/corpfiling/AttachLive/'
           + file_url.split('Pname=')[-1])
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    # Close the HTTP response deterministically once the body is read.
    with urlopen(req) as response:
        pdf_bytes = response.read()

    reader = PdfReader(io.BytesIO(pdf_bytes))
    # Join once instead of repeated +=; a page with no extractable text
    # contributes an empty string.
    document = ''.join(page.extract_text() or '' for page in reader.pages)

    return [1, document]
82
-
83
def _after_first_break(answer):
    """Return ``answer`` starting two characters past its first newline.

    When ``answer`` contains no newline it is returned unchanged. This
    presumably drops a lead-in line followed by a blank line -- TODO confirm
    against real model output.
    """
    newline_at = answer.find("\n")
    return answer if newline_at == -1 else answer[newline_at + 2:]


def summary(input_json):
    """Build the summary record for one BSE announcement.

    The record is validated and its PDF text fetched through ``filter``; the
    model is then asked for a short summary, a long summary, a one-word tag,
    a headline and a sentiment. Each answer except the sentiment is trimmed
    with ``_after_first_break``.

    Args:
        input_json: Mapping with at least ``FileURL``, ``symbol``,
            ``TypeofAnnouncement``, ``Descriptor`` and ``newsdate``.

    Returns:
        The rejection reason string from ``filter`` when validation fails;
        otherwise a dict with the keys "Link to BSE website",
        "Date of time of receiving data from BSE", "Stock Ticker",
        "Short Summary", "Long summary", "Tag", "Headline",
        "Date and time of data delivery from Skylark" and "Sentiment".
    """
    # Validate first so a rejected record does not pay for reading prompts.
    status, payload = filter(input_json)
    if status == 0:
        return payload
    long_text = payload

    prompts = pd.read_excel('DescriptorPrompt.xlsx')
    short_prompts = prompts.iloc[:, 1].to_list()
    long_prompts = prompts.iloc[:, 2].to_list()

    # Prompts are selected by the descriptor's position in desc_list.
    descriptor_index = desc_list.index(input_json['Descriptor'])

    output = {}
    output["Link to BSE website"] = (
        'https://www.bseindia.com/xml-data/corpfiling/AttachLive/'
        + input_json['FileURL'].split('Pname=')[-1]
    )
    output["Date of time of receiving data from BSE"] = input_json["newsdate"] + "Z"
    output["Stock Ticker"] = input_json['symbol']

    output['Short Summary'] = _after_first_break(
        callAzure(short_prompts[descriptor_index], long_text))
    output['Long summary'] = _after_first_break(
        callAzure(long_prompts[descriptor_index], long_text))

    # Tag and headline are derived from the short summary, not the full text.
    output['Tag'] = _after_first_break(
        callAzure("1 word Financial SEO tag for this news article",
                  output['Short Summary']))
    output['Headline'] = _after_first_break(
        callAzure("Give a single headline for this News Article",
                  output['Short Summary']))

    # Take an aware "now" directly in IST. The previous
    # utcnow().astimezone(...) treated the naive UTC value as *local* time,
    # which is only correct on hosts whose clock is set to UTC.
    ist = datetime.timezone(datetime.timedelta(hours=5, minutes=30))
    ist_now = datetime.datetime.now(ist)
    # Explicit %H:%M:%S rather than the locale-dependent %X.
    # NOTE(review): the trailing "Z" conventionally denotes UTC, yet the value
    # is IST; kept for output compatibility -- confirm with consumers.
    output['Date and time of data delivery from Skylark'] = (
        ist_now.strftime("%Y-%m-%dT%H:%M:%S") + "Z"
    )

    # The old prompt ended in a literal "{}" that was never formatted and so
    # was sent to the model verbatim; it is dropped here.
    sentiment_prompt = ("Answer in one word the sentiment of this News "
                        "out of Positive, Negative or Neutral")
    output['Sentiment'] = callAzure(sentiment_prompt, output['Short Summary'])

    return output
 
1
+ import datetime
2
+ from urllib.request import Request, urlopen
3
+ from pypdf import PdfReader
4
+ from io import StringIO
5
+ import io
6
+ import pandas as pd
7
+ import os
8
+ import torch
9
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
10
+ from transformers import pipeline
11
+ from openai import OpenAI
12
+ from groq import Groq
13
+ import time
14
+ from openai import OpenAI
15
+
16
# Descriptor values accepted by the pipeline: the first column of
# Descriptor.xlsx, which is read without a header row. Loaded once at import.
# NOTE(review): an OpenAI API key used to be committed here inside a comment.
# Secrets belong in the environment, and the exposed key should be revoked.
desc = pd.read_excel('Descriptor.xlsx', header=None)
desc_list = desc.iloc[:, 0].to_list()
20
+
21
def callAzure(prompt, text):
    """Send ``prompt`` followed by ``text`` to the Azure-hosted chat model.

    The two arguments are joined with a single space and sent as a single
    user message.

    Args:
        prompt: Instruction text placed at the start of the message.
        text: Document text appended after the prompt.

    Returns:
        The message content of the first choice in the completion response.

    Raises:
        RuntimeError: If the ``AZURE_MISTRAL_API_KEY`` environment variable
            is not set.
    """
    # The key was previously hard-coded in this function. Credentials must
    # come from the environment so they never land in version control; the
    # previously committed key should be treated as compromised and rotated.
    api_key = os.getenv("AZURE_MISTRAL_API_KEY")
    if not api_key:
        raise RuntimeError(
            "AZURE_MISTRAL_API_KEY is not set; export the Azure endpoint key "
            "before calling callAzure()"
        )

    url = "https://Mistral-large-tmhcg-serverless.eastus2.inference.ai.azure.com"
    client = OpenAI(base_url=url, api_key=api_key)
    msg = "{} {}".format(prompt, text)

    response = client.chat.completions.create(
        messages=[{"role": "user", "content": msg}],
        model="azureai",
    )
    return response.choices[0].message.content
39
+
40
def filter(input_json):
    """Validate an announcement record and extract the text of its PDF.

    Checks run in order and stop at the first failure:

    1. ``FileURL`` must be a string other than ``'null'`` (case-insensitive).
    2. ``symbol`` must not be ``'null'`` and must appear in the first column
       of ``symbol.xlsx``.
    3. ``TypeofAnnouncement`` must be ``'General_Announcements'``,
       ``'Outcome'`` or ``'General'``.
    4. ``Descriptor`` must appear in the module-level ``desc_list``.

    Args:
        input_json: Mapping holding the four keys listed above.

    Returns:
        ``[0, reason]`` when a check fails, with ``reason`` one of
        ``"File_URL"``, ``"symbol"``, ``"Annoucement"`` or ``"Desc"``;
        otherwise ``[1, document]`` where ``document`` is the concatenated
        text of every page of the downloaded PDF.
    """
    # Validate the URL before touching any spreadsheet: it is the cheapest
    # check, and a missing or non-string value must not crash on .lower().
    file_url = input_json.get('FileURL')
    if not isinstance(file_url, str) or file_url.lower() == 'null':
        return [0, "File_URL"]

    sym_list = pd.read_excel('symbol.xlsx', header=None).iloc[:, 0].to_list()
    symbol = input_json.get('symbol')
    if symbol == 'null' or symbol not in sym_list:
        return [0, "symbol"]

    allowed_types = ('General_Announcements', 'Outcome', 'General')
    if input_json.get('TypeofAnnouncement') not in allowed_types:
        # The misspelt reason string is kept as-is: callers may match on it.
        return [0, "Annoucement"]

    if input_json.get('Descriptor') not in desc_list:
        return [0, "Desc"]

    # Only the part after 'Pname=' is used to build the attachment URL.
    url = ('https://www.bseindia.com/xml-data/corpfiling/AttachLive/'
           + file_url.split('Pname=')[-1])
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    # Close the HTTP response deterministically once the body is read.
    with urlopen(req) as response:
        pdf_bytes = response.read()

    reader = PdfReader(io.BytesIO(pdf_bytes))
    # Join once instead of repeated +=; a page with no extractable text
    # contributes an empty string.
    document = ''.join(page.extract_text() or '' for page in reader.pages)

    return [1, document]
65
+
66
def summary(input_json):
    """Build the summary record for one BSE announcement.

    The record is validated and its PDF text fetched through ``filter``; the
    model is then asked for a short summary, a long summary, a one-word tag,
    a headline and a sentiment. Model answers are stored exactly as returned.

    Args:
        input_json: Mapping with at least ``FileURL``, ``symbol``,
            ``TypeofAnnouncement``, ``Descriptor`` and ``newsdate``.

    Returns:
        The rejection reason string from ``filter`` when validation fails;
        otherwise a dict with the keys "Link to BSE website",
        "Date of time of receiving data from BSE", "Stock Ticker",
        "Short Summary", "Long summary", "Tag", "Headline",
        "Date and time of data delivery from Skylark" and "Sentiment".
    """
    # Validate first so a rejected record does not pay for reading prompts.
    status, payload = filter(input_json)
    if status == 0:
        return payload
    long_text = payload

    prompts = pd.read_excel('DescriptorPrompt.xlsx')
    short_prompts = prompts.iloc[:, 1].to_list()
    long_prompts = prompts.iloc[:, 2].to_list()

    # Prompts are selected by the descriptor's position in desc_list.
    descriptor_index = desc_list.index(input_json['Descriptor'])

    output = {}
    output["Link to BSE website"] = (
        'https://www.bseindia.com/xml-data/corpfiling/AttachLive/'
        + input_json['FileURL'].split('Pname=')[-1]
    )
    output["Date of time of receiving data from BSE"] = input_json["newsdate"] + "Z"
    output["Stock Ticker"] = input_json['symbol']

    output['Short Summary'] = callAzure(short_prompts[descriptor_index], long_text)
    output['Long summary'] = callAzure(long_prompts[descriptor_index], long_text)

    # Tag and headline are derived from the short summary, not the full text.
    output['Tag'] = callAzure(
        "1 word Financial SEO tag for this news article",
        output['Short Summary'],
    )
    output['Headline'] = callAzure(
        "Give a single headline for this News Article",
        output['Short Summary'],
    )

    # Take an aware "now" directly in IST. The previous
    # utcnow().astimezone(...) treated the naive UTC value as *local* time,
    # which is only correct on hosts whose clock is set to UTC.
    ist = datetime.timezone(datetime.timedelta(hours=5, minutes=30))
    ist_now = datetime.datetime.now(ist)
    # Explicit %H:%M:%S rather than the locale-dependent %X.
    # NOTE(review): the trailing "Z" conventionally denotes UTC, yet the value
    # is IST; kept for output compatibility -- confirm with consumers.
    output['Date and time of data delivery from Skylark'] = (
        ist_now.strftime("%Y-%m-%dT%H:%M:%S") + "Z"
    )

    # The old prompt ended in a literal "{}" that was never formatted and so
    # was sent to the model verbatim; it is dropped here.
    sentiment_prompt = ("Answer in one word the sentiment of this News "
                        "out of Positive, Negative or Neutral")
    output['Sentiment'] = callAzure(sentiment_prompt, output['Short Summary'])

    return output