Crison11 commited on
Commit
8e7d687
1 Parent(s): 9f1bcff

Upload datageneration.py

Browse files

A simple way to generate QA pairs using an LLM + prompting

Files changed (1) hide show
  1. datageneration.py +128 -0
datageneration.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.text_splitter import CharacterTextSplitter
2
+ import os
3
+ import PyPDF2
4
+ import openai
5
+ import json
6
+ import csv
7
+ from openai import AzureOpenAI
8
+ from openai import OpenAI
9
+
10
def get_text_chunks(text):
    """Split *text* into overlapping chunks sized for LLM prompting.

    Splits on newline boundaries via langchain's CharacterTextSplitter:
    3000-character chunks with a 400-character overlap so context at a
    chunk border is not lost.
    """
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=3000,
        chunk_overlap=400,
        length_function=len,
    )
    return splitter.split_text(text)
16
+
17
def read_pdf(file_path):
    """Return the concatenated text of every page of the PDF at *file_path*.

    Opens the file in binary mode and walks the pages in order.  For
    scanned/image-only pages, ``extract_text()`` can yield an empty or
    None result; such pages contribute nothing instead of raising a
    TypeError on string concatenation.
    """
    pdf_text = ""

    with open(file_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)

        # Iterate the pages directly rather than indexing by range(len(...)).
        for page in pdf_reader.pages:
            # Guard: extract_text() may return None/"" for pages without
            # a text layer -- "or ''" keeps the += safe.
            pdf_text += page.extract_text() or ""

    return pdf_text
30
+
31
def pdfs_from_folder(folder_path):
    """Extract text from every ``.pdf`` file directly inside *folder_path*.

    Returns one text string per PDF, in ``os.listdir`` order.  Non-PDF
    files are skipped; subdirectories are not recursed into.
    """
    return [
        read_pdf(os.path.join(folder_path, name))
        for name in os.listdir(folder_path)
        if name.endswith('.pdf')
    ]
41
+
42
+
43
# Instruction prompt for the QA-generation model.  The few-shot examples pin
# the expected output format: a single JSON object with "question"/"answer"
# keys.  Downstream code json.loads() the raw completion, so the examples
# must themselves be strictly valid JSON -- the original second example held
# an inline "#" comment inside the braces, which taught the model to emit
# unparseable JSON; the explanation now lives outside the object.
SYSTEM_PROMPT = """
You are an AI whose purpose it is to generate question and answer pairs.

It is crucial these question answer pairs are specific to the context the USER will give you and are related to TECHNICAL content, such that these question answer pairs cannot be retrieved otherwise. DO NOT make up questions and answers that are not related to the context the USER will give you, this will be heavily penalized.

If no technical question can be formulated, it is acceptable to return none. You are expected to return the question pair in JSON like so:

{
    "question": "What is the operating pressure of TK-3413?",
    "answer": "The operating pressure is 1.5 bar."
}

Examples:
USER:
"TK-3413 is a pressure vessel that is used to store water. It is used in the production of the Ford F-150. The operating pressure is 1.5 bar."
AI:
{
    "question": "What is the operating pressure of TK-3413?",
    "answer": "The operating pressure is 1.5 bar."
}
USER:
"The capital of France is Paris, in Paris lays the Eiffel Tower. The Eiffel Tower is 324 meters tall."
AI:
{
    "question": "NONE",
    "answer": "NONE."
}
(In the last example no technical question can be formulated and any search engine can retrieve this information, so NONE is returned. Note the reply is still valid JSON with no comments inside it.)
"""
72
# NOTE(security): this module previously carried commented-out Azure OpenAI
# configuration embedding a hard-coded API key and endpoint.  Credentials
# must never be committed to source control, even commented out -- the dead
# block has been removed and the exposed key should be rotated.  All
# configuration now comes from the environment.
client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY")
)
84
def chat_complete(messages):
    """Send *messages* to gpt-3.5-turbo and return the raw completion object.

    A low temperature (0.1) keeps the QA extraction close to
    deterministic; 800 tokens is ample for one question/answer JSON
    object.
    """
    request = dict(
        model="gpt-3.5-turbo",
        messages=messages,
        temperature=0.1,
        max_tokens=800,
        top_p=0.95,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None,
    )
    return client.chat.completions.create(**request)
94
+
95
def get_messages(m):
    """Build the two-message chat payload for one context chunk *m*.

    Returns the list expected by chat_complete(): the system prompt
    followed by the user content prefixed with "USER: " (matching the
    few-shot format inside SYSTEM_PROMPT).

    (Was a lambda assigned to a name -- PEP 8 E731; a def gives a real
    __name__ and a place for this docstring.  Call signature unchanged.)
    """
    return [
        {
            "role": "system",
            "content": SYSTEM_PROMPT,
        },
        {
            "role": "user",
            "content": f"USER: {m}",
        },
    ]
105
+
106
if __name__ == "__main__":
    folder_path = "report"
    all_pdf_texts = pdfs_from_folder(folder_path)

    qa_pairs = []
    # Guard: a folder with no PDFs used to IndexError on all_pdf_texts[0];
    # now we simply emit an empty CSV.
    if all_pdf_texts:
        # NOTE: deliberately only the first PDF and at most 100 chunks,
        # to cap API cost during experimentation.
        for chunk in get_text_chunks(all_pdf_texts[0])[0:100]:
            response = chat_complete(get_messages(chunk))
            try:
                qa_pair = json.loads(response.choices[0].message.content)
            # Narrowed from a bare "except:" (which also swallowed
            # KeyboardInterrupt/SystemExit): skip chunks whose completion
            # is not parseable JSON or has an unexpected shape.
            except (json.JSONDecodeError, IndexError, AttributeError):
                continue
            qa_pairs.append(qa_pair)
    # print(qa_pairs)

    with open('qa_pairs.csv', 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['question', 'answer']
        # extrasaction='ignore': a stray extra key in the model's JSON
        # would otherwise raise ValueError mid-write and lose the file.
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames,
                                extrasaction='ignore')

        writer.writeheader()
        for pair in qa_pairs:
            writer.writerow(pair)

    print("QA pairs have been saved to 'qa_pairs.csv'.")