Spaces:

iarbel
/

amazon-feature-bullets-demo

Sleeping

App Files Files Community

iarbel committed on Nov 17, 2023

Commit

00f57d4

1 Parent(s): 4f7fb1b

add src files

Browse files

Files changed (4) hide show

.gitignore +160 -0
src/__init__.py +0 -0
src/few_shot_funcs.py +143 -0
src/scrape.py +98 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,160 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/

src/__init__.py ADDED Viewed

File without changes

src/few_shot_funcs.py ADDED Viewed

	@@ -0,0 +1,143 @@

+import re
+import openai
+import inflect
+import pandas as pd
+from typing import Dict
+from datasets import load_dataset
+from IPython.display import display, HTML
+from langchain.embeddings.openai import OpenAIEmbeddings
+from langchain.vectorstores import FAISS
+from langchain.vectorstores.utils import DistanceStrategy
+import os
# Read the OpenAI key from the environment instead of hard-coding it, so that a
# key supplied by the runtime is not overwritten with a blank value. Falls back
# to an empty string when the variable is unset.
OPENAI_KEY = os.environ.get('OPENAI_API_KEY', '')
openai.api_key = OPENAI_KEY
os.environ['OPENAI_API_KEY'] = OPENAI_KEY

# Constants
# Columns kept from the few-shot dataset
FS_COLUMNS = ['asin', 'category', 'title', 'tech_process', 'labels']
# Passed as `max_tokens` on every chat completion request
MAX_TOKENS = 700
# Template for a user turn; filled with a product title and its technical details
USER_TXT = 'Write feature-bullets for an Amazon product page. ' \
           'Title: {title}. Technical details: {tech_data}.\n\n### Feature-bullets:'

# Load few-shot dataset
FS_DATASET = load_dataset('iarbel/amazon-product-data-filter', split='validation')
# Prepare Pandas DFs with the relevant columns
FS_DS = FS_DATASET.to_pandas()[FS_COLUMNS]
# Load vector store
DB = FAISS.load_local('data/vector_stores/amazon-product-embedding', OpenAIEmbeddings(),
                      distance_strategy=DistanceStrategy.MAX_INNER_PRODUCT)
class Conversation:
    """
    Builds the message list sent to the Chat API.

    The conversation always opens with a fixed system message. After it,
    "user" and "assistant" messages must strictly alternate, starting with
    a "user" message.
    """

    def __init__(self):
        self.messages = [{'role': 'system',
                          'content': 'You are a helpful assistant. Your task is to write feature-bullets for an Amazon product page.'}]

    def add_message(self, role: str, content: str) -> None:
        """
        Append a message after checking that the turn order is respected.

        Args:
            role: "user" or "assistant" (case-insensitive).
            content: The message text.

        Raises:
            ValueError: If the role is unknown, or if it does not alternate
                correctly with the previous message.
        """
        role = role.lower()
        last_role = self.messages[-1]['role']
        if role not in ('user', 'assistant'):
            raise ValueError('Roles can be "user" or "assistant" only')
        if role == 'user' and last_role not in ('system', 'assistant'):
            raise ValueError('"user" message can only follow "system" or "assistant" message')
        if role == 'assistant' and last_role != 'user':
            raise ValueError('"assistant" message can only follow "user" message')
        self.messages.append({'role': role, 'content': content})

    def display_conversation(self) -> None:
        """Render every message as styled HTML through IPython's display machinery."""
        from html import escape

        SEP = '\n'
        for message in self.messages:
            # Escape first so markup-like characters in titles, technical details
            # or model output are shown literally; the tags inserted below are
            # the only HTML that reaches the front end.
            content = escape(message['content'], quote=False)
            if message['role'] == 'system':
                display(HTML(f'<b>{content}</b>'))
            elif message['role'] == 'user':
                # Put each section of the prompt on its own line with a bold label
                msg_align = content.replace("Title:", "<br><b>Title:</b>")\
                    .replace("Technical details:", "<br><b>Technical details:</b>")\
                    .replace("### Feature-bullets:", "<br><b>Feature-bullets:</b>")
                display(HTML(f'<p style="background-color:White; color:Black; padding:5px;">{msg_align}</p>'))
            else:
                # Assistant output: one bullet per line, spaced out for readability
                msg_align = content.lstrip(SEP).replace(SEP, "<br><br>")
                display(HTML(f'<p style="background-color:LightGray; color:Black; padding:5px;">{msg_align}</p>'))
def api_call(messages: list, temperature: float = 0.7, top_p: float = 1, n_responses: int = 1) -> dict:
    """
    Call the Chat API with a prepared conversation.

    Args:
        messages: The conversation as a list of {"role": ..., "content": ...}
            dicts, e.g. `Conversation.messages`.
        temperature: Sampling temperature (controls randomness).
        top_p: Forwarded to the API as `top_p`.
        n_responses: Number of completions to request.

    Returns:
        A dict with the keys 'object' (always 'chat'), 'usage' (the response's
        usage section as a plain dict) and 'text' (one string per completion).
    """
    params = {'model': 'gpt-3.5-turbo', 'messages': messages, 'temperature': temperature,
              'max_tokens': MAX_TOKENS, 'n': n_responses, 'top_p': top_p}
    response = openai.ChatCompletion.create(**params)
    text = [choice['message']['content'] for choice in response['choices']]
    # Copy the usage section through its public mapping interface instead of
    # reaching into the private `_previous` attribute of the response object.
    usage = dict(response['usage'])
    return {'object': 'chat', 'usage': usage, 'text': text}
class FewShotData:
    """
    Selects few-shot examples for a target product and turns them into a
    ready-to-send conversation.
    """

    def __init__(self, few_shot_df: pd.DataFrame, vector_db: FAISS):
        self.vector_db = vector_db
        self.few_shot_df = few_shot_df

    def extract_few_shot_data(self, target_title: str, k_shot: int = 2, **db_kwargs) -> pd.DataFrame:
        """
        Retrieve the example rows whose titles the vector store returns for `target_title`.

        Args:
            target_title: Title of the product to generate bullets for.
            k_shot: Number of results requested from the vector store.
            **db_kwargs: Extra keyword arguments forwarded to the MMR search.

        Returns:
            The 'title', 'tech_process' and 'labels' columns of the matching rows.
        """
        # Embed the target title and run a max-marginal-relevance search with it
        query_vector = OpenAIEmbeddings().embed_query(target_title)
        hits = self.vector_db.max_marginal_relevance_search_with_score_by_vector(
            query_vector, k=k_shot, **db_kwargs)
        matched_titles = [hit[0].page_content for hit in hits]
        # Keep only the rows whose title was retrieved, restricted to the prompt columns
        is_match = self.few_shot_df['title'].isin(matched_titles)
        return self.few_shot_df.loc[is_match, ['title', 'tech_process', 'labels']]

    def construct_few_shot_conversation(self, target_title: str, target_tech_data: str,
                                        few_shot_data: pd.DataFrame) -> Conversation:
        """
        Build a conversation made of the few-shot examples followed by the target prompt.

        Each example contributes a "user" prompt (title + technical details) and
        an "assistant" answer (its labels); the target product is appended as
        the final "user" message.
        """
        conversation = Conversation()
        # Walk the three example columns in lockstep
        examples = zip(few_shot_data['title'].to_list(),
                       few_shot_data['tech_process'].to_list(),
                       few_shot_data['labels'].to_list())
        for example_title, example_tech, example_labels in examples:
            conversation.add_message('user', USER_TXT.format(title=example_title, tech_data=example_tech))
            conversation.add_message('assistant', example_labels)
        # Add the final user prompt
        conversation.add_message('user', USER_TXT.format(title=target_title, tech_data=target_tech_data))
        return conversation
def return_is_are(text: str) -> str:
    """
    Pick the verb form used after `text`.

    Returns 'are' when inflect's `singular_noun` yields a truthy result for
    `text`, and 'is' otherwise.
    """
    singular_form = inflect.engine().singular_noun(text)
    if singular_form:
        return 'are'
    return 'is'
def format_tech_as_str(tech_data):
    """
    Turn technical details into one string of '<key> is/are <value>' sentences.

    Pairs in which the key or the value is falsy are skipped; the remaining
    sentences are joined with '. '.
    NOTE(review): assumes `tech_data.to_numpy()` yields two-item (key, value)
    rows, e.g. a two-column DataFrame - confirm against the caller.
    """
    sentences = []
    for key, value in tech_data.to_numpy():
        if key and value:
            sentences.append(f'{key} {return_is_are(key)} {value}')
    return '. '.join(sentences)
def generate_data(title: str, tech_process: str, few_shot_df: pd.DataFrame, vector_db: FAISS,
                  k_shot: int = 2, temperature: float = 0.7) -> str:
    """
    Generate feature-bullets for one product using few-shot prompting.

    Args:
        title: Title of the target product.
        tech_process: Technical details of the target product, already formatted as text.
        few_shot_df: DataFrame holding the candidate few-shot examples.
        vector_db: Vector store used to find examples similar to `title`.
        k_shot: Number of examples requested from the vector store (default 2).
        temperature: Sampling temperature for the completion (default 0.7).

    Returns:
        A string starting with the "## Feature-Bullets" heading followed by
        the first completion returned by the API.
    """
    fs_example = FewShotData(few_shot_df=few_shot_df, vector_db=vector_db)
    fs_data = fs_example.extract_few_shot_data(target_title=title, k_shot=k_shot)
    fs_conv = fs_example.construct_few_shot_conversation(target_title=title,
                                                         target_tech_data=tech_process,
                                                         few_shot_data=fs_data)
    api_res = api_call(fs_conv.messages, temperature=temperature)
    return "## Feature-Bullets\n" + api_res['text'][0]
def check_url_structure(url: str) -> bool:
    """
    Check that `url` is an amazon.com product-page URL.

    Accepted shape: https://www.amazon.com[/<anything>]/dp/<10 alphanumerics>[/]

    The dots in the host name are escaped so look-alike hosts such as
    "www.amazon-com" are rejected, and the whole string must match, so a
    trailing newline or any other suffix is rejected as well.
    """
    pattern = r"https://www\.amazon\.com(/.+)?/dp/[a-zA-Z0-9]{10}/?"
    return re.fullmatch(pattern, url) is not None

src/scrape.py ADDED Viewed

	@@ -0,0 +1,98 @@

+import re
+import requests
+from base64 import b64decode
+from bs4 import BeautifulSoup
+from typing import Dict
# Zyte API key; blank in the committed source and must be filled in before use
Z_KEY = ''


def zyte_call(url: str, timeout: float = 120) -> bytes:
    """
    Fetch the raw HTTP response body of `url` through the Zyte extract API.

    Args:
        url: Page to fetch.
        timeout: Seconds to wait for the Zyte API before giving up, so a
            stalled request cannot hang the caller indefinitely.

    Returns:
        The page body as bytes, decoded from the base64 "httpResponseBody"
        field of the API response.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the API does not answer within `timeout`.
    """
    api_response = requests.post(
        "https://api.zyte.com/v1/extract",
        auth=(Z_KEY, ""),
        json={
            "url": url,
            "httpResponseBody": True,
        },
        timeout=timeout,
    )
    # Surface API errors directly instead of failing later with an opaque
    # KeyError on a payload that has no "httpResponseBody" field.
    api_response.raise_for_status()
    http_response_body: bytes = b64decode(
        api_response.json()["httpResponseBody"])
    return http_response_body
def get_asin_pdp(soup: BeautifulSoup) -> Dict[str, object]:
    """
    Extract the main fields of a product detail page.

    Args:
        soup: Parsed HTML of the product page.

    Returns:
        A dict with the keys:
          - 'asin': last path segment of the canonical link's href, or None
            when the link or its href is missing.
          - 'title': stripped text of the "productTitle" span, or None.
          - 'feature_bullets': list of bullet strings, or None when the
            "feature-bullets" div is absent.
          - 'tech_data': key/value pairs merged from the KV, tech and A+ tables.
    """
    # Get ASIN
    try:
        asin = soup.find('link', rel='canonical')['href'].split('/')[-1]
    except (TypeError, KeyError):
        # TypeError: no canonical <link> was found (find returned None);
        # KeyError: the tag exists but carries no href attribute.
        asin = None

    # Get title
    title_tag = soup.find('span', id="productTitle")
    title = title_tag.text.strip() if title_tag else None

    # Get feature-bullets
    bullets_div = soup.find('div', id="feature-bullets")
    if bullets_div:
        bullet_tags = bullets_div.find_all('span', class_='a-list-item')
        feature_bullets = [tag.text.strip() for tag in bullet_tags]
        # Remove unwanted bullets
        feature_bullets = [b for b in feature_bullets if b != 'Make sure this fits by entering your model number.']
    else:
        feature_bullets = None

    # Get KV, tech, A+ tables. Merge with override key hierarchy: A+ > tech > KV
    kv_res = parse_kv_table(soup)
    tech_res = parse_tech_table(soup)
    ap_data = parse_ap_table(soup)
    tech_data = {**kv_res, **tech_res, **ap_data}

    return {'asin': asin, 'title': title, 'feature_bullets': feature_bullets, 'tech_data': tech_data}
def parse_kv_table(soup: BeautifulSoup) -> Dict[str, str]:
    """
    Parse the key/value table inside the "productOverview_feature_div" section.

    Args:
        soup: Parsed HTML of the product page.

    Returns:
        Mapping of stripped first-cell text to stripped second-cell text for
        every table row with at least two <td> cells. Empty when the section
        or its table is missing.
    """
    kv_res = {}
    overview = soup.find('div', id='productOverview_feature_div')
    table = overview.find('table') if overview is not None else None
    if table is None:
        return kv_res
    for row in table.find_all('tr'):
        cells = row.find_all('td')
        # Skip rows that do not hold a key cell and a value cell rather than
        # failing on the missing index.
        if len(cells) < 2:
            continue
        kv_res[cells[0].text.strip()] = cells[1].text.strip()
    return kv_res
def parse_tech_table(soup: BeautifulSoup) -> Dict[str, str]:
    """
    Parse every table whose id matches "productDetails_techSpec.*".

    Args:
        soup: Parsed HTML of the product page.

    Returns:
        Mapping of each row's <th> text to its <td> text, with surrounding
        whitespace and U+200E (left-to-right mark) characters removed from
        the value. Rows missing either cell are skipped.
    """
    tech_res = {}
    for tab in soup.find_all('table', id=re.compile('productDetails_techSpec.*')):
        for row in tab.find_all('tr'):
            header = row.find('th')
            cell = row.find('td')
            # A row without both a header and a value cannot contribute a pair
            if header is None or cell is None:
                continue
            tech_res[header.text.strip()] = cell.text.replace('\u200e', '').strip()
    return tech_res
def parse_ap_table(soup: BeautifulSoup) -> Dict[str, str]:
    """
    Parse the tables found inside every <div id="tech"> element.

    Args:
        soup: Parsed HTML of the product page.

    Returns:
        Mapping of first-cell text to second-cell text for every row with at
        least two <td> cells, with surrounding whitespace and U+200E
        (left-to-right mark) characters removed from both.
    """
    ap_res = {}
    for div in soup.find_all('div', id='tech'):
        for tab in div.find_all('table'):
            for row in tab.find_all('tr'):
                cells = row.find_all('td')
                # Need both a key cell and a value cell; a lone <td> is skipped
                # instead of failing on the missing second cell.
                if len(cells) < 2:
                    continue
                key = cells[0].text.replace('\u200e', '').strip()
                value = cells[1].text.replace('\u200e', '').strip()
                ap_res[key] = value
    return ap_res