Spaces:
Runtime error
Runtime error
192345678
committed on
Commit
•
2f30f9f
1
Parent(s):
3834d58
add *pdf
Browse files- .gitattributes +1 -0
- app.py +205 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
*.pdf filter=lfs diff=lfs merge=lfs -text
|
app.py
CHANGED
@@ -0,0 +1,205 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from llama_index.core import SimpleDirectoryReader, KnowledgeGraphIndex
|
3 |
+
from llama_index.core.graph_stores import SimpleGraphStore
|
4 |
+
from llama_index.llms.openai import OpenAI
|
5 |
+
from llama_index.core import Settings
|
6 |
+
from IPython.display import Markdown, display
|
7 |
+
from llama_index.core import VectorStoreIndex, get_response_synthesizer
|
8 |
+
from llama_index.core.retrievers import VectorIndexRetriever
|
9 |
+
from llama_index.core.query_engine import RetrieverQueryEngine
|
10 |
+
from llama_index.core.postprocessor import SimilarityPostprocessor
|
11 |
+
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
|
12 |
+
import logging
|
13 |
+
import sys
|
14 |
+
from llama_index.core import StorageContext, load_index_from_storage
|
15 |
+
|
16 |
+
# logging
|
17 |
+
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
|
18 |
+
|
19 |
+
## Plain (baseline) RAG: results were poor, so this approach was abandoned
|
20 |
+
# documents = SimpleDirectoryReader("./dataset").load_data()
|
21 |
+
# # indexing
|
22 |
+
# index = VectorStoreIndex.from_documents(documents, show_progress = True)
|
23 |
+
#
|
24 |
+
# # save index
|
25 |
+
# index.storage_context.persist(persist_dir="./index")
|
26 |
+
#
|
27 |
+
# # rebuild storage context
|
28 |
+
# storage_context = StorageContext.from_defaults(persist_dir="./index")
|
29 |
+
#
|
30 |
+
# # load index
|
31 |
+
# index = load_index_from_storage(storage_context)
|
32 |
+
#
|
33 |
+
# retriever = index.as_retriever(
|
34 |
+
# similarity_top_k = 10
|
35 |
+
# )
|
36 |
+
#
|
37 |
+
# question = """ You are a skilled teaching assistant with 10 years of experience in Artificial Intelligence, particularly Natural Language Processing. You have two main job responsibilities:\
|
38 |
+
# 1. Discover the parts of the course lesson plan that students are more interested in through dialogue with them. At the same time, you will combine your knowledge to introduce the relevant content to the students, so that the students can have a deeper understanding of the relevant content in the lesson plan that they are interested in or don't know much about yet.
|
39 |
+
# 2. Answer students' questions about AI and practical language processing according to the course handout, trying to explain it step by step to ensure that students can fully understand all the concepts or related applications, and give some code as examples if necessary. If you can find relevant information in the course handouts, please try to answer based on your own knowledge and experience. Remember not to give incorrect answers. If you are unsure of an answer, tell the student that you are unsure. If the answer is correct and accurate, you will be paid well. Therefore, please try to address all questions asked by the student.
|
40 |
+
#
|
41 |
+
# Question: 请为我解释课件第10页seq to seq结构图的含义
|
42 |
+
# """
|
43 |
+
#
|
44 |
+
# # configure response synthesizer
|
45 |
+
# response_synthesizer = get_response_synthesizer()
|
46 |
+
#
|
47 |
+
# # assemble query engine
|
48 |
+
# query_engine = RetrieverQueryEngine(
|
49 |
+
# retriever=retriever,
|
50 |
+
# response_synthesizer=response_synthesizer,
|
51 |
+
# node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.5)],
|
52 |
+
# )
|
53 |
+
#
|
54 |
+
# response = query_engine.query(question)
|
55 |
+
#
|
56 |
+
# # test retriever result
|
57 |
+
# nodes = retriever.retrieve(question)
|
58 |
+
# for i in range(10):
|
59 |
+
# print("label page ", nodes[i].node.metadata["page_label"])
|
60 |
+
#
|
61 |
+
# # test result
|
62 |
+
# page = 0
|
63 |
+
# sum_correct = 0
|
64 |
+
# for i in range(1, 49):
|
65 |
+
# page_list = []
|
66 |
+
# question = f"""
|
67 |
+
# You are a skilled teaching assistant with 10 years of experience in Artificial Intelligence, particularly Natural Language Processing. You have two main job responsibilities:\
|
68 |
+
# 1. Discover the parts of the course lesson plan that students are more interested in through dialogue with them. At the same time, you will combine your knowledge to introduce the relevant content to the students, so that the students can have a deeper understanding of the relevant content in the lesson plan that they are interested in or don't know much about yet.
|
69 |
+
# 2. Answer students' questions about AI and practical language processing according to the course handout, trying to explain it step by step to ensure that students can fully understand all the concepts or related applications, and give some code as examples if necessary. If you can find relevant information in the course handouts, please try to answer based on your own knowledge and experience. Remember not to give incorrect answers. If you are unsure of an answer, tell the student that you are unsure. If the answer is correct and accurate, you will be paid well. Therefore, please try to address all questions asked by the student.
|
70 |
+
# Question: 请为我总结课件第{i}页的主要内容
|
71 |
+
# """
|
72 |
+
# nodes = retriever.retrieve(question)
|
73 |
+
# for j in range(10):
|
74 |
+
# page_list.append(int(nodes[j].node.metadata["page_label"]))
|
75 |
+
# # print("label page ", nodes[j].node.metadata["page_label"])
|
76 |
+
# print(f"label list for page {i}", page_list)
|
77 |
+
# if i in page_list:
|
78 |
+
# sum_correct = sum_correct + 1
|
79 |
+
# print(f"page {i} 索引正确")
|
80 |
+
# print("正确率", (sum_correct/67)*100, "%")
|
81 |
+
|
82 |
+
|
83 |
+
|
84 |
+
# # Knowledge-graph RAG: index the PDF courseware as a knowledge graph and use RAG to answer questions
|
85 |
+
# from llama_index.core import VectorStoreIndex, get_response_synthesizer
|
86 |
+
# from llama_index.core.retrievers import VectorIndexRetriever
|
87 |
+
# from llama_index.core.query_engine import RetrieverQueryEngine
|
88 |
+
# from llama_index.core.postprocessor import SimilarityPostprocessor
|
89 |
+
# from llama_index.core.query_engine import RetrieverQueryEngine
|
90 |
+
# from llama_index.core.retrievers import KnowledgeGraphRAGRetriever
|
91 |
+
# from llama_index.core import StorageContext
|
92 |
+
#
|
93 |
+
# graph_store = SimpleGraphStore()
|
94 |
+
# storage_context = StorageContext.from_defaults(graph_store=graph_store)
|
95 |
+
#
|
96 |
+
# # NOTE: can take a while!
|
97 |
+
# index = KnowledgeGraphIndex.from_documents(
|
98 |
+
# documents,
|
99 |
+
# max_triplets_per_chunk=2,
|
100 |
+
# storage_context=storage_context,
|
101 |
+
# )
|
102 |
+
# graph_rag_retriever = index.as_retriever()
|
103 |
+
#
|
104 |
+
# question = """
|
105 |
+
# You are a skilled teaching assistant with 10 years of experience in Artificial Intelligence, particularly Natural Language Processing. You have two main job responsibilities:\
|
106 |
+
# 1. Discover the parts of the course lesson plan that students are more interested in through dialogue with them. At the same time, you will combine your knowledge to introduce the relevant content to the students, so that the students can have a deeper understanding of the relevant content in the lesson plan that they are interested in or don't know much about yet.
|
107 |
+
# 2. Answer students' questions about AI and practical language processing according to the course handout, trying to explain it step by step to ensure that students can fully understand all the concepts or related applications, and give some code as examples if necessary. If you can find relevant information in the course handouts, please try to answer based on your own knowledge and experience. Remember not to give incorrect answers. If you are unsure of an answer, tell the student that you are unsure. If the answer is correct and accurate, you will be paid well. Therefore, please try to address all questions asked by the student.
|
108 |
+
#
|
109 |
+
#
|
110 |
+
# Question: 请为我解释课件第10页seq to seq结构图的含义
|
111 |
+
# """
|
112 |
+
# # configure response synthesizer
|
113 |
+
# response_synthesizer = get_response_synthesizer()
|
114 |
+
#
|
115 |
+
# # assemble query engine
|
116 |
+
# query_engine = RetrieverQueryEngine(
|
117 |
+
# retriever=graph_rag_retriever,
|
118 |
+
# response_synthesizer=response_synthesizer,
|
119 |
+
# node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.5)],
|
120 |
+
# )
|
121 |
+
# response = query_engine.query(question)
|
122 |
+
#
|
123 |
+
#
|
124 |
+
# try PDF parser
|
125 |
+
# from pathlib import Path
|
126 |
+
#
|
127 |
+
# from llama_index.readers.file import PyMuPDFReader
|
128 |
+
#
|
129 |
+
# loader = PyMuPDFReader()
|
130 |
+
# documents = loader.load_data(file_path=Path("./test_document/TPML_day_2_v3.pdf"), metadata=True)
|
131 |
+
# indexing
|
132 |
+
# index_pdf = VectorStoreIndex.from_documents(documents, show_progress = True)
|
133 |
+
#
|
134 |
+
|
135 |
+
|
136 |
+
|
137 |
+
|
138 |
+
|
139 |
+
# RAG + rerank approach: vector and BM25 retrieval fused with reciprocal reranking
|
140 |
+
# Hybrid retrieval pipeline: a vector retriever and a BM25 retriever over the
# same index, fused by QueryFusionRetriever and wrapped in a query engine.
# Module-level names defined here and used later: documents, splitter, index,
# vector_retriever, bm25_retriever, retriever, query_engine.
import nest_asyncio
from llama_index.core import SimpleDirectoryReader
from llama_index.core import VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.retrievers import QueryFusionRetriever
from llama_index.retrievers.bm25 import BM25Retriever

# Load everything under "dataset" (resolved relative to the working directory).
documents = SimpleDirectoryReader("dataset").load_data()

# Split documents with chunk_size=256 before indexing.
splitter = SentenceSplitter(chunk_size=256)
index = VectorStoreIndex.from_documents(documents, transformations=[splitter])

# Both retrievers are built once with top_k=10. An earlier top_k=2 pair was
# created and immediately overwritten, so it only cost an extra BM25 build.
vector_retriever = index.as_retriever(similarity_top_k=10)
bm25_retriever = BM25Retriever.from_defaults(
    docstore=index.docstore, similarity_top_k=10
)

# Fuse the two result lists; keep the top 12 after "reciprocal_rerank".
retriever = QueryFusionRetriever(
    [vector_retriever, bm25_retriever],
    similarity_top_k=12,
    num_queries=4,  # set this to 1 to disable query generation
    mode="reciprocal_rerank",
    use_async=True,
    verbose=True,
    # we could override the query generation prompt here
    query_gen_prompt=(
        "You are a helpful assistant that generates multiple search queries based on a "
        "single input query. Generate {num_queries} search queries, one on each line, "
        "related to the following input query:\n"
        "Query: {query}\n"
        "Queries:\n"
    ),
)

query_engine = RetrieverQueryEngine.from_args(retriever)

# apply nested async to run in a notebook
nest_asyncio.apply()
|
190 |
+
|
191 |
+
|
192 |
+
# For pages 1..68, ask the query engine to explain that page, then print the
# page labels of up to ten retrieved nodes followed by the generated answer.
# Relies on the module-level `query_engine` and `retriever` defined earlier.
# NOTE(review): the prompt says "If you can find relevant information ...
# answer based on your own knowledge"; this presumably should read "cannot
# find". Left byte-identical pending confirmation.
for i in range(1, 69):
    question = f"""
You are a skilled teaching assistant with 10 years of experience in Artificial Intelligence, particularly Natural Language Processing. You have two main job responsibilities:\
1. Discover the parts of the course lesson plan that students are more interested in through dialogue with them. At the same time, you will combine your knowledge to introduce the relevant content to the students, so that the students can have a deeper understanding of the relevant content in the lesson plan that they are interested in or don't know much about yet.
2. Answer students' questions about AI and practical language processing according to the course handout, trying to explain it step by step to ensure that students can fully understand all the concepts or related applications, and give some code as examples if necessary. If you can find relevant information in the course handouts, please try to answer based on your own knowledge and experience. Remember not to give incorrect answers. If you are unsure of an answer, tell the student that you are unsure. If the answer is correct and accurate, you will be paid well. Therefore, please try to address all questions asked by the student.


Question: 请为我解释课件第{i}页的含义
"""
    response = query_engine.query(question)
    nodes = retriever.retrieve(question)
    # Slice rather than index 0..9: the retriever may return fewer than ten
    # nodes, and nodes[j] would then raise IndexError.
    for hit in nodes[:10]:
        print("label page ", hit.node.metadata["page_label"])
    # NOTE(review): assumed to sit inside the loop (one answer per page); the
    # original indentation was lost. Confirm against app.py.
    print(response)
|