192345678 committed on
Commit
2f30f9f
1 Parent(s): 3834d58
Files changed (2) hide show
  1. .gitattributes +1 -0
  2. app.py +205 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.pdf filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from llama_index.core import SimpleDirectoryReader, KnowledgeGraphIndex
3
+ from llama_index.core.graph_stores import SimpleGraphStore
4
+ from llama_index.llms.openai import OpenAI
5
+ from llama_index.core import Settings
6
+ from IPython.display import Markdown, display
7
+ from llama_index.core import VectorStoreIndex, get_response_synthesizer
8
+ from llama_index.core.retrievers import VectorIndexRetriever
9
+ from llama_index.core.query_engine import RetrieverQueryEngine
10
+ from llama_index.core.postprocessor import SimilarityPostprocessor
11
+ from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
12
+ import logging
13
+ import sys
14
+ from llama_index.core import StorageContext, load_index_from_storage
15
+
16
+ # logging
17
+ logging.basicConfig(stream=sys.stdout, level=logging.INFO)
18
+
19
+ ## 最普通的RAG 效果比较差舍弃
20
+ # documents = SimpleDirectoryReader("./dataset").load_data()
21
+ # # indexing
22
+ # index = VectorStoreIndex.from_documents(documents, show_progress = True)
23
+ #
24
+ # # save index
25
+ # index.storage_context.persist(persist_dir="./index")
26
+ #
27
+ # # rebuild storage context
28
+ # storage_context = StorageContext.from_defaults(persist_dir="./index")
29
+ #
30
+ # # load index
31
+ # index = load_index_from_storage(storage_context)
32
+ #
33
+ # retriever = index.as_retriever(
34
+ # similarity_top_k = 10
35
+ # )
36
+ #
37
+ # question = """ You are a skilled teaching assistant with 10 years of experience in Artificial Intelligence, particularly Natural Language Processing. You have two main job responsibilities:\
38
+ # 1. Discover the parts of the course lesson plan that students are more interested in through dialogue with them. At the same time, you will combine your knowledge to introduce the relevant content to the students, so that the students can have a deeper understanding of the relevant content in the lesson plan that they are interested in or don't know much about yet.
39
+ # 2. Answer students' questions about AI and practical language processing according to the course handout, trying to explain it step by step to ensure that students can fully understand all the concepts or related applications, and give some code as examples if necessary. If you can find relevant information in the course handouts, please try to answer based on your own knowledge and experience. Remember not to give incorrect answers. If you are unsure of an answer, tell the student that you are unsure. If the answer is correct and accurate, you will be paid well. Therefore, please try to address all questions asked by the student.
40
+ #
41
+ # Question: 请为我解释课件第10页seq to seq结构图的含义
42
+ # """
43
+ #
44
+ # # configure response synthesizer
45
+ # response_synthesizer = get_response_synthesizer()
46
+ #
47
+ # # assemble query engine
48
+ # query_engine = RetrieverQueryEngine(
49
+ # retriever=retriever,
50
+ # response_synthesizer=response_synthesizer,
51
+ # node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.5)],
52
+ # )
53
+ #
54
+ # response = query_engine.query(question)
55
+ #
56
+ # # test retriever result
57
+ # nodes = retriever.retrieve(question)
58
+ # for i in range(10):
59
+ # print("label page ", nodes[i].node.metadata["page_label"])
60
+ #
61
+ # # test result
62
+ # page = 0
63
+ # sum_correct = 0
64
+ # for i in range(1, 49):
65
+ # page_list = []
66
+ # question = f"""
67
+ # You are a skilled teaching assistant with 10 years of experience in Artificial Intelligence, particularly Natural Language Processing. You have two main job responsibilities:\
68
+ # 1. Discover the parts of the course lesson plan that students are more interested in through dialogue with them. At the same time, you will combine your knowledge to introduce the relevant content to the students, so that the students can have a deeper understanding of the relevant content in the lesson plan that they are interested in or don't know much about yet.
69
+ # 2. Answer students' questions about AI and practical language processing according to the course handout, trying to explain it step by step to ensure that students can fully understand all the concepts or related applications, and give some code as examples if necessary. If you can find relevant information in the course handouts, please try to answer based on your own knowledge and experience. Remember not to give incorrect answers. If you are unsure of an answer, tell the student that you are unsure. If the answer is correct and accurate, you will be paid well. Therefore, please try to address all questions asked by the student.
70
+ # Question: 请为我总结课件第{i}页的主要内容
71
+ # """
72
+ # nodes = retriever.retrieve(question)
73
+ # for j in range(10):
74
+ # page_list.append(int(nodes[j].node.metadata["page_label"]))
75
+ # # print("label page ", nodes[j].node.metadata["page_label"])
76
+ # print(f"label list for page {i}", page_list)
77
+ # if i in page_list:
78
+ # sum_correct = sum_correct + 1
79
+ # print(f"page {i} 索引正确")
80
+ # print("正确率", (sum_correct/67)*100, "%")
81
+
82
+
83
+
84
+ # # knowledge graph index 知识图谱RAG,将PDF课件以知识图谱的形式作为索引,使用RAG来回答问题
85
+ # from llama_index.core import VectorStoreIndex, get_response_synthesizer
86
+ # from llama_index.core.retrievers import VectorIndexRetriever
87
+ # from llama_index.core.query_engine import RetrieverQueryEngine
88
+ # from llama_index.core.postprocessor import SimilarityPostprocessor
89
+ # from llama_index.core.query_engine import RetrieverQueryEngine
90
+ # from llama_index.core.retrievers import KnowledgeGraphRAGRetriever
91
+ # from llama_index.core import StorageContext
92
+ #
93
+ # graph_store = SimpleGraphStore()
94
+ # storage_context = StorageContext.from_defaults(graph_store=graph_store)
95
+ #
96
+ # # NOTE: can take a while!
97
+ # index = KnowledgeGraphIndex.from_documents(
98
+ # documents,
99
+ # max_triplets_per_chunk=2,
100
+ # storage_context=storage_context,
101
+ # )
102
+ # graph_rag_retriever = index.as_retriever()
103
+ #
104
+ # question = """
105
+ # You are a skilled teaching assistant with 10 years of experience in Artificial Intelligence, particularly Natural Language Processing. You have two main job responsibilities:\
106
+ # 1. Discover the parts of the course lesson plan that students are more interested in through dialogue with them. At the same time, you will combine your knowledge to introduce the relevant content to the students, so that the students can have a deeper understanding of the relevant content in the lesson plan that they are interested in or don't know much about yet.
107
+ # 2. Answer students' questions about AI and practical language processing according to the course handout, trying to explain it step by step to ensure that students can fully understand all the concepts or related applications, and give some code as examples if necessary. If you can find relevant information in the course handouts, please try to answer based on your own knowledge and experience. Remember not to give incorrect answers. If you are unsure of an answer, tell the student that you are unsure. If the answer is correct and accurate, you will be paid well. Therefore, please try to address all questions asked by the student.
108
+ #
109
+ #
110
+ # Question: 请为我解释课件第10页seq to seq结构图的含义
111
+ # """
112
+ # # configure response synthesizer
113
+ # response_synthesizer = get_response_synthesizer()
114
+ #
115
+ # # assemble query engine
116
+ # query_engine = RetrieverQueryEngine(
117
+ # retriever=graph_rag_retriever,
118
+ # response_synthesizer=response_synthesizer,
119
+ # node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.5)],
120
+ # )
121
+ # response = query_engine.query(question)
122
+ #
123
+ #
124
+ # try PDF parser
125
+ # from pathlib import Path
126
+ #
127
+ # from llama_index.readers.file import PyMuPDFReader
128
+ #
129
+ # loader = PyMuPDFReader()
130
+ # documents = loader.load_data(file_path=Path("./test_document/TPML_day_2_v3.pdf"), metadata=True)
131
+ # indexing
132
+ # index_pdf = VectorStoreIndex.from_documents(documents, show_progress = True)
133
+ #
134
+
135
+
136
+
137
+
138
+
# RAG + rerank: hybrid (vector + BM25) retrieval fused with reciprocal
# reranking, evaluated by asking about every page of the course slides.

# Directory that SimpleDirectoryReader loads the course material from.
DATASET_DIR = "dataset"

# Pages 1..NUM_PAGES are queried, one question per page.
NUM_PAGES = 68

# Upper bound on how many retrieved page labels are printed per question.
MAX_LABELS_SHOWN = 10

# Placeholder printed when a retrieved node carries no "page_label" metadata.
MISSING_PAGE_LABEL = "unknown"

# Prompt used by QueryFusionRetriever to generate additional search queries.
QUERY_GEN_PROMPT = (
    "You are a helpful assistant that generates multiple search queries based on a "
    "single input query. Generate {num_queries} search queries, one on each line, "
    "related to the following input query:\n"
    "Query: {query}\n"
    "Queries:\n"
)

# Question sent to the query engine; "{page}" is replaced by the page number.
# The backslash at the end of the first sentence is a line continuation, so
# item "1." follows "responsibilities:" without a line break.
# NOTE(review): "If you can find relevant information in the course handouts,
# please try to answer based on your own knowledge" probably means "cannot
# find" - the wording is left untouched here; confirm before changing it.
QUESTION_TEMPLATE = """
You are a skilled teaching assistant with 10 years of experience in Artificial Intelligence, particularly Natural Language Processing. You have two main job responsibilities:\
1. Discover the parts of the course lesson plan that students are more interested in through dialogue with them. At the same time, you will combine your knowledge to introduce the relevant content to the students, so that the students can have a deeper understanding of the relevant content in the lesson plan that they are interested in or don't know much about yet.
2. Answer students' questions about AI and practical language processing according to the course handout, trying to explain it step by step to ensure that students can fully understand all the concepts or related applications, and give some code as examples if necessary. If you can find relevant information in the course handouts, please try to answer based on your own knowledge and experience. Remember not to give incorrect answers. If you are unsure of an answer, tell the student that you are unsure. If the answer is correct and accurate, you will be paid well. Therefore, please try to address all questions asked by the student.


Question: 请为我解释课件第{page}页的含义
"""


def build_question(page: int) -> str:
    """Return the teaching-assistant prompt asking to explain slide *page*."""
    return QUESTION_TEMPLATE.format(page=page)


def collect_page_labels(nodes, limit: int = MAX_LABELS_SHOWN) -> list:
    """Return the ``page_label`` metadata of at most *limit* retrieved nodes.

    Args:
        nodes: Sequence of retrieval hits; each hit exposes ``.node.metadata``
            as a dict.
        limit: Maximum number of labels to return.

    Returns:
        One entry per inspected hit, in retrieval order. Hits without a
        ``page_label`` key yield ``MISSING_PAGE_LABEL`` instead of raising.
    """
    # Slicing (rather than indexing 0..limit-1) keeps this safe when the
    # retriever returns fewer hits than requested.
    return [
        hit.node.metadata.get("page_label", MISSING_PAGE_LABEL)
        for hit in nodes[:limit]
    ]


def main(num_pages: int = NUM_PAGES) -> None:
    """Build the hybrid retriever and query it once per slide page.

    For every page from 1 to *num_pages* the query engine is asked to explain
    that page; the page labels of the nodes used for the answer are printed,
    followed by the answer itself.
    """
    # Imported here so that the pure helpers above stay importable without
    # the llama_index stack installed.
    import nest_asyncio
    from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
    from llama_index.core.node_parser import SentenceSplitter
    from llama_index.core.query_engine import RetrieverQueryEngine
    from llama_index.core.retrievers import QueryFusionRetriever
    from llama_index.retrievers.bm25 import BM25Retriever

    documents = SimpleDirectoryReader(DATASET_DIR).load_data()
    splitter = SentenceSplitter(chunk_size=256)
    index = VectorStoreIndex.from_documents(documents, transformations=[splitter])

    # Both retrievers are built exactly once, with the top-k that was
    # actually in effect (10).
    vector_retriever = index.as_retriever(similarity_top_k=10)
    bm25_retriever = BM25Retriever.from_defaults(
        docstore=index.docstore,
        similarity_top_k=10,
    )

    retriever = QueryFusionRetriever(
        [vector_retriever, bm25_retriever],
        similarity_top_k=12,
        num_queries=4,  # set this to 1 to disable query generation
        mode="reciprocal_rerank",
        use_async=True,
        verbose=True,
        query_gen_prompt=QUERY_GEN_PROMPT,
    )
    query_engine = RetrieverQueryEngine.from_args(retriever)

    # apply nested async to run in a notebook
    nest_asyncio.apply()

    for page in range(1, num_pages + 1):
        response = query_engine.query(build_question(page))
        # Report the nodes the answer was actually synthesized from instead
        # of running a second retrieval (which would repeat the LLM query
        # generation and may return different nodes).
        for label in collect_page_labels(response.source_nodes):
            print("label page ", label)
        print(response)


if __name__ == "__main__":
    main()