In [1]:
import os

# Fail fast (KeyError) when the API key is missing from the environment.
# The previous `openai.api_key = ...` assignment raised NameError on a fresh
# kernel because `openai` is never imported in this notebook. No explicit
# assignment is needed: the LangChain `OpenAI` wrapper used below picks up
# OPENAI_API_KEY from the environment on its own.
# The key is deliberately not bound to a variable or left as a bare expression,
# so it can never end up rendered in a cell output.
if "OPENAI_API_KEY" not in os.environ:
    raise KeyError("OPENAI_API_KEY")

In [2]:
from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains import LLMChain, HypotheticalDocumentEmbedder
from langchain.prompts import PromptTemplate

from langchain.document_loaders import TextLoader
import langchain
from langchain.embeddings import HuggingFaceBgeEmbeddings

In [3]:
# Base embedding model used to encode both the corpus chunks and the
# hypothetical documents. (The original line also bound `llm` to this string
# through a chained assignment, leaving `llm` as a non-LLM value until the
# next cell ran; that stray binding is removed.)
model_name = "BAAI/bge-small-en-v1.5"
encode_kwargs = {'normalize_embeddings': True}  # set True to compute cosine similarity

bge_embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    # NOTE(review): 'cuda' presumably requires a CUDA-capable GPU on the host;
    # switch to 'cpu' when none is available — confirm for your environment.
    model_kwargs={'device': 'cuda'},
    encode_kwargs=encode_kwargs,
)

In [4]:
# Default-configured OpenAI completion model. It is handed to the HyDE
# embedder in the next cell, where it writes the passage that gets embedded.
llm = OpenAI()

In [5]:
# HyDE embedder built from the stock "web_search" prompt: the LLM drafts a
# passage for the query, and `bge_embeddings` encodes that passage.
embeddings = HypotheticalDocumentEmbedder.from_llm(
    llm=llm,
    base_embeddings=bge_embeddings,
    prompt_key="web_search",
)

In [6]:
# Display the prompt template that the "web_search" key resolved to.
embeddings.llm_chain.prompt

PromptTemplate(input_variables=['QUESTION'], template='Please write a passage to answer the question \nQuestion: {QUESTION}\nPassage:')

In [7]:
# Turn on LangChain's global debug flag so the prompt sent to the LLM and the
# raw generations it returns are printed under the cells that follow.
langchain.debug = True

In [8]:
# Embed a single question through HyDE; with debug logging enabled, the
# generated hypothetical passage is printed as a side effect.
result = embeddings.embed_query(
    text="What are some distinguishing features or amenities that guests frequently highlight about their experience at Willard InterContinental Washington?"
)

[32;1m[1;3m[llm/start][0m [1m[1:llm:OpenAI] Entering LLM run with input:
[0m{
 "prompts": [
 "Please write a passage to answer the question \nQuestion: What are some distinguishing features or amenities that guests frequently highlight about their experience at Willard InterContinental Washington?\nPassage:"
 ]
}
[36;1m[1;3m[llm/end][0m [1m[1:llm:OpenAI] [3.52s] Exiting LLM run with output:
[0m{
 "generations": [
 [
 {
 "text": " \nWillard InterContinental Washington is renowned for its luxurious amenities and distinguished features. Guests often highlight the quality of the hotel's accommodations, from the plush beds and pillows to the premium bath amenities. The hotel's five-star restaurant, Café du Parc, is also a highlight for many guests, who appreciate its sophisticated atmosphere and its wide selection of French-inspired dishes. The hotel's signature bar, The Round Robin, is also a popular choice for guests, with its classic cocktails and relaxed atmosphere. Other amen

In [9]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma

# Input text files, resolved relative to the notebook's working directory.
# (Judging by the file names: hotel offering metadata plus three chunks of
# review text.)
source_paths = [
    'offering_0.001.txt',
    'output_proper_review_chunk_1.txt',
    'output_proper_review_chunk_2.txt',
    'output_proper_review_chunk_3.txt',
]
loaders = [TextLoader(path) for path in source_paths]

# Flatten the documents returned by every loader into a single list. The
# comprehension keeps the loop variable out of the kernel namespace (the
# original loop leaked an ambiguous single-letter `l`).
docs = [doc for loader in loaders for doc in loader.load()]

# NOTE(review): CharacterTextSplitter splits on its separator only, so a file
# without that separator may yield chunks longer than chunk_size — confirm the
# resulting chunk sizes are acceptable for the embedding model.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)

# Split the loaded documents (metadata preserved) into retrieval-sized chunks.
texts = text_splitter.split_documents(docs)

In [10]:
# Preview only the first few chunks; dumping the whole list floods the cell
# output and bloats the saved notebook.
texts[:3]

[Document(page_content='[{"hotel_class": 4.0, "region_id": 60763, "url": "http://www.tripadvisor.com/Hotel_Review-g60763-d113317-Reviews-Casablanca_Hotel_Times_Square-New_York_City_New_York.html", "phone": "", "details": null, "address": {"region": "NY", "street-address": "147 West 43rd Street", "postal-code": "10036", "locality": "New York City"}, "type": "hotel", "id": 113317, "name": "Casablanca Hotel Times Square"}, {"hotel_class": 5.0, "region_id": 32655, "url": "http://www.tripadvisor.com/Hotel_Review-g32655-d76049-Reviews-Four_Seasons_Hotel_Los_Angeles_at_Beverly_Hills-Los_Angeles_California.html", "phone": "", "details": null, "address": {"region": "CA", "street-address": "300 S Doheny Dr", "postal-code": "90048", "locality": "Los Angeles"}, "type": "hotel", "id": 76049, "name": "Four Seasons Hotel Los Angeles at Beverly Hills"}, {"hotel_class": 3.5, "region_id": 60763, "url": "http://www.tripadvisor.com/Hotel_Review-g60763-d99352-Reviews-Hilton_Garden_Inn_Times_Square-New_York

In [11]:
# Re-create the LLM with n=4 / best_of=4 so each prompt yields four completions.
# NOTE: this rebinds `llm`, replacing the default instance created earlier;
# cells below this point use the multi-generation model.
# NOTE(review): presumably the intent is for HyDE to combine the embeddings of
# several hypothetical documents — confirm against the HyDE documentation.
llm = OpenAI(n=4, best_of=4)

In [12]:
# Custom HyDE prompt: asks the LLM for a review-style analysis answering the
# user's question. `{question}` is the only placeholder.
prompt_template = """Please answer the user's question as providing an analysis of the hotel reviews across different aspects such as service, cleanliness, location, value, sleep quality, rooms, and overall experience. Extract information about specific amenities, notable features, and recurring themes across the reviews to offer a comprehensive understanding of the guests' experiences.
Question: {question}
Answer:"""

prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["question"],
)

# Chain that fills the template with the question and sends it to `llm`.
llm_chain = LLMChain(prompt=prompt, llm=llm)

In [13]:
# HyDE embedder driven by the custom prompt chain rather than a stock prompt.
# NOTE: this rebinds `embeddings`, replacing the "web_search" embedder above.
embeddings = HypotheticalDocumentEmbedder(
    base_embeddings=bge_embeddings,
    llm_chain=llm_chain,
)

In [14]:
# Index the text chunks in a Chroma vector store, using the HyDE embedder as
# the embedding function.
docsearch = Chroma.from_documents(
    documents=texts,
    embedding=embeddings,
)

# Retrieve the chunks most similar to the question.
# NOTE: `docs` is rebound here; it previously held the raw loaded documents.
query = "What are some distinguishing features or amenities that guests frequently highlight about their experience at Casablanca Hotel Times Square in New York City?"
docs = docsearch.similarity_search(query=query)

[32;1m[1;3m[llm/start][0m [1m[1:llm:OpenAI] Entering LLM run with input:
[0m{
 "prompts": [
 "Please answer the user's question as providing an analysis of the hotel reviews across different aspects such as service, cleanliness, location, value, sleep quality, rooms, and overall experience. Extract information about specific amenities, notable features, and recurring themes across the reviews to offer a comprehensive understanding of the guests' experiences.\nQuestion: What are some distinguishing features or amenities that guests frequently highlight about their experience at Casablanca Hotel Times Square in New York City?\nAnswer:"
 ]
}
[36;1m[1;3m[llm/end][0m [1m[1:llm:OpenAI] [3.01s] Exiting LLM run with output:
[0m{
 "generations": [
 [
 {
 "text": " Guests at Casablanca Hotel Times Square in New York City have consistently praised the hotel's convenient location, friendly staff, and clean rooms. Many travelers have highlighted the hotel's proximity to Times Square, Broa