jamescalam commited on
Commit
90ae924
1 Parent(s): d2b37ff

first version

Browse files
Files changed (3) hide show
  1. app.py +52 -0
  2. gif-search-indexer.ipynb +1992 -0
  3. requirements.txt +4 -0
app.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pinecone
3
+ from sentence_transformers import SentenceTransformer
4
+
5
+ PINECONE_KEY = st.secrets["PINECONE_KEY"] # app.pinecone.io
6
+
7
@st.experimental_singleton(show_spinner=False)
def init_pinecone():
    """Initialise the Pinecone client and return a handle to the GIF index.

    Wrapped in Streamlit's ``experimental_singleton`` so the connection is
    created once and reused, instead of being rebuilt on every script rerun.

    Returns:
        The ``pinecone.Index`` object for the ``'gif-search'`` index.
    """
    # The API key is read from Streamlit secrets at module level (PINECONE_KEY).
    pinecone.init(environment="us-west1-gcp", api_key=PINECONE_KEY)
    gif_index = pinecone.Index('gif-search')
    return gif_index
11
+
12
@st.experimental_singleton(show_spinner=False)
def init_retriever():
    """Load the sentence-embedding model used to encode search queries.

    Wrapped in Streamlit's ``experimental_singleton`` so the model is loaded
    once and shared, rather than re-instantiated on every script rerun.

    Returns:
        A ``SentenceTransformer`` for ``all-MiniLM-L6-v2``.
    """
    model_name = 'sentence-transformers/all-MiniLM-L6-v2'
    return SentenceTransformer(model_name)
15
+
16
# Build (or fetch the cached) query encoder; the spinner is only visible
# while init_retriever() is actually doing work.
with st.spinner("Retrieving the Retriever..."):
    retriever = init_retriever()

# Open (or fetch the cached) handle to the Pinecone index.
with st.spinner("Connecting to Pinecone..."):
    index = init_pinecone()
21
+
22
def card(urls):
    """Render a list of GIF URLs as a centred, wrapping row of thumbnails.

    Each URL becomes a fixed-size ``<img>`` inside a ``<figure>``; all figures
    are placed in one flex container and written to the page as raw HTML.

    Args:
        urls: Iterable of image URLs. Each value is HTML-escaped before being
            placed in the ``src`` attribute, because the markup is rendered
            with ``unsafe_allow_html=True`` and the URLs come from index
            metadata rather than from this file.

    Returns:
        The value returned by ``st.markdown`` for the rendered HTML.
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    # The original figure style ended with a dangling "!important;" which is
    # not a valid CSS declaration; it is dropped here so the style is valid.
    # The markup is built without indentation or blank lines so that it cannot
    # be mistaken for an indented Markdown code block.
    figures = [
        '<figure style="margin-top: 5px; margin-bottom: 5px;">'
        f'<img src="{escape(str(url), quote=True)}" '
        'style="width: 130px; height: 100px; padding-left: 5px; padding-right: 5px">'
        '</figure>'
        for url in urls
    ]
    gallery = (
        '<div style="display: flex; flex-flow: row wrap; '
        'text-align: center; justify-content: center;">'
        + ''.join(figures)
        + '</div>'
    )
    return st.markdown(gallery, unsafe_allow_html=True)
33
+
34
# Page header and short explanation shown above the search box.
st.write("""
## ⚡️ AI-Powered GIF Search ⚡️

Search for GIFs using Semantic Search, learn how it works [here](https://www.pinecone.io/learn/gif-search/).
""")

query = st.text_input("What are you looking for?", "")

# Skip empty and whitespace-only input: there is nothing meaningful to embed,
# and running a search for it would only waste an index query.
if query.strip():
    with st.spinner(text="Similarity Searching..."):
        # Encode the query into a vector and fetch the 30 nearest matches,
        # including the metadata that holds each GIF's URL.
        xq = retriever.encode([query]).tolist()
        xc = index.query(xq, top_k=30, include_metadata=True)

    # The same GIF can be matched more than once (presumably because the
    # index stores one vector per description and a GIF may have several
    # descriptions -- confirm against the indexing notebook). dict.fromkeys
    # removes repeated URLs while keeping the ranking order.
    urls = list(dict.fromkeys(
        match['metadata']['url'] for match in xc['matches']
    ))

    with st.spinner(text="Fetching GIFs 🚀🚀🚀"):
        card(urls)
gif-search-indexer.ipynb ADDED
@@ -0,0 +1,1992 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# NLP Powered GIF search"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "markdown",
12
+ "metadata": {},
13
+ "source": [
14
+ "We will use the [Tumblr GIF Description Dataset](http://raingo.github.io/TGIF-Release/), which contains over 100K animated GIFs and 120K sentences describing their visual content. Using this data with a *vector database* and a *retriever*, we are able to create an NLP-powered GIF search tool.\n",
15
+ "\n",
16
+ "There are a few packages that must be installed for this notebook to run:"
17
+ ]
18
+ },
19
+ {
20
+ "cell_type": "code",
21
+ "execution_count": null,
22
+ "metadata": {},
23
+ "outputs": [],
24
+ "source": [
25
+ "pip install -U pandas pinecone-client sentence-transformers tqdm"
26
+ ]
27
+ },
28
+ {
29
+ "cell_type": "markdown",
30
+ "metadata": {},
31
+ "source": [
32
+ "We must also set the following notebook parameters to display the GIF images we will be working with."
33
+ ]
34
+ },
35
+ {
36
+ "cell_type": "code",
37
+ "execution_count": 1,
38
+ "metadata": {
39
+ "id": "r8KN-iWWdwby"
40
+ },
41
+ "outputs": [],
42
+ "source": [
43
+ "from IPython.display import HTML\n",
44
+ "from IPython.core.interactiveshell import InteractiveShell\n",
45
+ "InteractiveShell.ast_node_interactivity = \"all\""
46
+ ]
47
+ },
48
+ {
49
+ "cell_type": "markdown",
50
+ "metadata": {
51
+ "id": "KFIZrga-6Jq_"
52
+ },
53
+ "source": [
54
+ "## Download and Extract Dataset"
55
+ ]
56
+ },
57
+ {
58
+ "cell_type": "markdown",
59
+ "metadata": {},
60
+ "source": [
61
+ "First let's download and extract the dataset. The dataset is available [here](https://github.com/raingo/TGIF-Release) on GitHub. We can use the link below to download the dataset directly. We can also access the link from a browser to directly download the files."
62
+ ]
63
+ },
64
+ {
65
+ "cell_type": "code",
66
+ "execution_count": 2,
67
+ "metadata": {
68
+ "colab": {
69
+ "base_uri": "https://localhost:8080/"
70
+ },
71
+ "id": "ZD4gusO9YB1-",
72
+ "outputId": "2d69fa61-f67a-45c8-ecc4-4d1c9b06f7cf"
73
+ },
74
+ "outputs": [
75
+ {
76
+ "name": "stdout",
77
+ "output_type": "stream",
78
+ "text": [
79
+ "--2022-08-13 16:40:35-- https://github.com/raingo/TGIF-Release/archive/master.zip\n",
80
+ "Resolving github.com (github.com)... 140.82.114.4\n",
81
+ "Connecting to github.com (github.com)|140.82.114.4|:443... connected.\n",
82
+ "HTTP request sent, awaiting response... 302 Found\n",
83
+ "Location: https://codeload.github.com/raingo/TGIF-Release/zip/refs/heads/master [following]\n",
84
+ "--2022-08-13 16:40:35-- https://codeload.github.com/raingo/TGIF-Release/zip/refs/heads/master\n",
85
+ "Resolving codeload.github.com (codeload.github.com)... 140.82.114.10\n",
86
+ "Connecting to codeload.github.com (codeload.github.com)|140.82.114.10|:443... connected.\n",
87
+ "HTTP request sent, awaiting response... 200 OK\n",
88
+ "Length: unspecified [application/zip]\n",
89
+ "Saving to: ‘master.zip’\n",
90
+ "\n",
91
+ "master.zip [ <=> ] 11.82M 6.59MB/s in 1.8s \n",
92
+ "\n",
93
+ "2022-08-13 16:40:37 (6.59 MB/s) - ‘master.zip’ saved [12396861]\n",
94
+ "\n"
95
+ ]
96
+ }
97
+ ],
98
+ "source": [
99
+ "# Use wget to download the master.zip file which contains the dataset\n",
100
+ "!wget https://github.com/raingo/TGIF-Release/archive/master.zip"
101
+ ]
102
+ },
103
+ {
104
+ "cell_type": "code",
105
+ "execution_count": 3,
106
+ "metadata": {
107
+ "id": "qLvXp0RtYTTz"
108
+ },
109
+ "outputs": [
110
+ {
111
+ "name": "stdout",
112
+ "output_type": "stream",
113
+ "text": [
114
+ "Archive: master.zip\n",
115
+ "3e54d2f71418d8a2e9f5f61aa5be0edb9c0ac2b8\n",
116
+ " creating: TGIF-Release-master/\n",
117
+ " inflating: TGIF-Release-master/.gitignore \n",
118
+ " inflating: TGIF-Release-master/.gitmodules \n",
119
+ " inflating: TGIF-Release-master/LICENSE \n",
120
+ " inflating: TGIF-Release-master/README.md \n",
121
+ " creating: TGIF-Release-master/code/\n",
122
+ " inflating: TGIF-Release-master/code/README.md \n",
123
+ " creating: TGIF-Release-master/code/crowdflower/\n",
124
+ " extracting: TGIF-Release-master/code/crowdflower/.gitignore \n",
125
+ " inflating: TGIF-Release-master/code/crowdflower/.gitmodules \n",
126
+ " inflating: TGIF-Release-master/code/crowdflower/README.md \n",
127
+ " creating: TGIF-Release-master/code/crowdflower/back-end/\n",
128
+ " inflating: TGIF-Release-master/code/crowdflower/back-end/.gitignore \n",
129
+ " inflating: TGIF-Release-master/code/crowdflower/back-end/deploy.sh \n",
130
+ " inflating: TGIF-Release-master/code/crowdflower/back-end/entity_extract.py \n",
131
+ " inflating: TGIF-Release-master/code/crowdflower/back-end/eval.py \n",
132
+ " creating: TGIF-Release-master/code/crowdflower/back-end/logs/\n",
133
+ " extracting: TGIF-Release-master/code/crowdflower/back-end/logs/.gitignore \n",
134
+ " inflating: TGIF-Release-master/code/crowdflower/back-end/logs/logging.conf \n",
135
+ " inflating: TGIF-Release-master/code/crowdflower/back-end/requirements.txt \n",
136
+ " inflating: TGIF-Release-master/code/crowdflower/back-end/routes.py \n",
137
+ " inflating: TGIF-Release-master/code/crowdflower/back-end/start_nlp_sever.sh \n",
138
+ " inflating: TGIF-Release-master/code/crowdflower/back-end/swear-words \n",
139
+ " inflating: TGIF-Release-master/code/crowdflower/back-end/test-data.sorted \n",
140
+ " creating: TGIF-Release-master/code/crowdflower/front-end/\n",
141
+ " extracting: TGIF-Release-master/code/crowdflower/front-end/.gitignore \n",
142
+ " creating: TGIF-Release-master/code/crowdflower/front-end/data/\n",
143
+ " extracting: TGIF-Release-master/code/crowdflower/front-end/data/.gitignore \n",
144
+ " inflating: TGIF-Release-master/code/crowdflower/front-end/data/gen_test_cases.py \n",
145
+ " inflating: TGIF-Release-master/code/crowdflower/front-end/data/notify.py \n",
146
+ " inflating: TGIF-Release-master/code/crowdflower/front-end/data/parse-res.py \n",
147
+ " inflating: TGIF-Release-master/code/crowdflower/front-end/data/pipeline.py \n",
148
+ " inflating: TGIF-Release-master/code/crowdflower/front-end/data/review-judgments.py \n",
149
+ " inflating: TGIF-Release-master/code/crowdflower/front-end/data/review-rest.py \n",
150
+ " inflating: TGIF-Release-master/code/crowdflower/front-end/data/review-test.py \n",
151
+ " inflating: TGIF-Release-master/code/crowdflower/front-end/data/set-diff.py \n",
152
+ " inflating: TGIF-Release-master/code/crowdflower/front-end/data/shuffle-test.py \n",
153
+ " inflating: TGIF-Release-master/code/crowdflower/front-end/data/update-test.py \n",
154
+ " creating: TGIF-Release-master/code/crowdflower/front-end/layout/\n",
155
+ " extracting: TGIF-Release-master/code/crowdflower/front-end/layout/.gitignore \n",
156
+ " inflating: TGIF-Release-master/code/crowdflower/front-end/layout/forgive.js \n",
157
+ " inflating: TGIF-Release-master/code/crowdflower/front-end/layout/instructions.md \n",
158
+ " inflating: TGIF-Release-master/code/crowdflower/front-end/layout/main.html \n",
159
+ " inflating: TGIF-Release-master/code/crowdflower/front-end/layout/main.js \n",
160
+ " inflating: TGIF-Release-master/code/crowdflower/front-end/layout/view.css \n",
161
+ " creating: TGIF-Release-master/code/crowdflower/table3-rating/\n",
162
+ " extracting: TGIF-Release-master/code/crowdflower/table3-rating/.gitignore \n",
163
+ " inflating: TGIF-Release-master/code/crowdflower/table3-rating/requirements.txt \n",
164
+ " inflating: TGIF-Release-master/code/crowdflower/table3-rating/routes.py \n",
165
+ " creating: TGIF-Release-master/code/crowdflower/table3-rating/static/\n",
166
+ " inflating: TGIF-Release-master/code/crowdflower/table3-rating/static/view.css \n",
167
+ " creating: TGIF-Release-master/code/crowdflower/table3-rating/templates/\n",
168
+ " inflating: TGIF-Release-master/code/crowdflower/table3-rating/templates/_formhelper.html \n",
169
+ " inflating: TGIF-Release-master/code/crowdflower/table3-rating/templates/submit.html \n",
170
+ " creating: TGIF-Release-master/code/gif2txt-lstm/\n",
171
+ " inflating: TGIF-Release-master/code/gif2txt-lstm/README.md \n",
172
+ " inflating: TGIF-Release-master/code/gif2txt-lstm/caffe-rnn.patch \n",
173
+ " creating: TGIF-Release-master/code/gif2txt-lstm/models/\n",
174
+ " inflating: TGIF-Release-master/code/gif2txt-lstm/models/README.md \n",
175
+ " creating: TGIF-Release-master/code/gifs-filter/\n",
176
+ " extracting: TGIF-Release-master/code/gifs-filter/.gitignore \n",
177
+ " inflating: TGIF-Release-master/code/gifs-filter/.gitmodules \n",
178
+ " inflating: TGIF-Release-master/code/gifs-filter/README.md \n",
179
+ " creating: TGIF-Release-master/code/gifs-filter/adult-filter/\n",
180
+ " inflating: TGIF-Release-master/code/gifs-filter/adult-filter/README.md \n",
181
+ " inflating: TGIF-Release-master/code/gifs-filter/adult-filter/filter.sh \n",
182
+ " inflating: TGIF-Release-master/code/gifs-filter/adult-filter/gen-raw.sh \n",
183
+ " creating: TGIF-Release-master/code/gifs-filter/adult-filter/keywords/\n",
184
+ " inflating: TGIF-Release-master/code/gifs-filter/adult-filter/keywords/README.md \n",
185
+ " inflating: TGIF-Release-master/code/gifs-filter/adult-filter/parse-tsv.py \n",
186
+ " creating: TGIF-Release-master/code/gifs-filter/c3d-models/\n",
187
+ " inflating: TGIF-Release-master/code/gifs-filter/c3d-models/README.md \n",
188
+ " inflating: TGIF-Release-master/code/gifs-filter/c3d-models/cluster-by-tags.py \n",
189
+ " inflating: TGIF-Release-master/code/gifs-filter/c3d-models/dump-tags.py \n",
190
+ " inflating: TGIF-Release-master/code/gifs-filter/c3d-models/filter-images.py \n",
191
+ " inflating: TGIF-Release-master/code/gifs-filter/c3d-models/filter-text.py \n",
192
+ " inflating: TGIF-Release-master/code/gifs-filter/c3d-models/filter_tags.py \n",
193
+ " creating: TGIF-Release-master/code/gifs-filter/c3d-models/giftypes/\n",
194
+ " inflating: TGIF-Release-master/code/gifs-filter/c3d-models/giftypes/c3d-models-rfc.pkl \n",
195
+ " creating: TGIF-Release-master/code/gifs-filter/c3d-models/no-motion/\n",
196
+ " inflating: TGIF-Release-master/code/gifs-filter/c3d-models/no-motion/c3d-models-rfc.pkl \n",
197
+ " inflating: TGIF-Release-master/code/gifs-filter/c3d-models/predict.py \n",
198
+ " inflating: TGIF-Release-master/code/gifs-filter/c3d-models/rank_tags.py \n",
199
+ " inflating: TGIF-Release-master/code/gifs-filter/c3d-models/setdiff.py \n",
200
+ " inflating: TGIF-Release-master/code/gifs-filter/c3d-models/setinter.py \n",
201
+ " inflating: TGIF-Release-master/code/gifs-filter/c3d-models/tag_rules \n",
202
+ " inflating: TGIF-Release-master/code/gifs-filter/c3d-models/train.py \n",
203
+ " inflating: TGIF-Release-master/code/gifs-filter/c3d.patch \n",
204
+ " creating: TGIF-Release-master/code/gifs-filter/c3d/\n",
205
+ " extracting: TGIF-Release-master/code/gifs-filter/c3d/.gitignore \n",
206
+ " inflating: TGIF-Release-master/code/gifs-filter/c3d/README.md \n",
207
+ " inflating: TGIF-Release-master/code/gifs-filter/c3d/agg_feat.py \n",
208
+ " inflating: TGIF-Release-master/code/gifs-filter/c3d/build_deploy.py \n",
209
+ " inflating: TGIF-Release-master/code/gifs-filter/c3d/c3d.sh \n",
210
+ " inflating: TGIF-Release-master/code/gifs-filter/c3d/deploy.prototxt.in \n",
211
+ " creating: TGIF-Release-master/code/gifs-filter/dedup/\n",
212
+ " inflating: TGIF-Release-master/code/gifs-filter/dedup/.gitignore \n",
213
+ " inflating: TGIF-Release-master/code/gifs-filter/dedup/README.md \n",
214
+ " inflating: TGIF-Release-master/code/gifs-filter/dedup/agg-hash.py \n",
215
+ " inflating: TGIF-Release-master/code/gifs-filter/dedup/build.sh \n",
216
+ " inflating: TGIF-Release-master/code/gifs-filter/dedup/cluster-pairs.py \n",
217
+ " inflating: TGIF-Release-master/code/gifs-filter/dedup/dedup-v2.sh \n",
218
+ " inflating: TGIF-Release-master/code/gifs-filter/dedup/dedup.sh \n",
219
+ " inflating: TGIF-Release-master/code/gifs-filter/dedup/dump-nd.py \n",
220
+ " inflating: TGIF-Release-master/code/gifs-filter/dedup/extract_hash.sh \n",
221
+ " inflating: TGIF-Release-master/code/gifs-filter/dedup/extract_mhhash.cpp \n",
222
+ " inflating: TGIF-Release-master/code/gifs-filter/dedup/filter-cluster.py \n",
223
+ " inflating: TGIF-Release-master/code/gifs-filter/dedup/match-hash.sh \n",
224
+ " creating: TGIF-Release-master/code/gifs-filter/dedup/mih/\n",
225
+ " extracting: TGIF-Release-master/code/gifs-filter/dedup/mih/README.md \n",
226
+ " creating: TGIF-Release-master/code/gifs-filter/dedup/pHash-0.9.6/\n",
227
+ " inflating: TGIF-Release-master/code/gifs-filter/dedup/pHash-0.9.6/README.md \n",
228
+ " inflating: TGIF-Release-master/code/gifs-filter/dedup/store-hash.sh \n",
229
+ " inflating: TGIF-Release-master/code/gifs-filter/email_notify.py \n",
230
+ " inflating: TGIF-Release-master/code/gifs-filter/full.sh \n",
231
+ " inflating: TGIF-Release-master/code/gifs-filter/gen_set.py \n",
232
+ " inflating: TGIF-Release-master/code/gifs-filter/monitor-api.sh \n",
233
+ " inflating: TGIF-Release-master/code/gifs-filter/monitor.sh \n",
234
+ " inflating: TGIF-Release-master/code/gifs-filter/pipeline.sh \n",
235
+ " inflating: TGIF-Release-master/code/gifs-filter/prepare-data.sh \n",
236
+ " inflating: TGIF-Release-master/code/gifs-filter/requirements.txt \n",
237
+ " inflating: TGIF-Release-master/code/gifs-filter/review-CF.py \n",
238
+ " inflating: TGIF-Release-master/code/gifs-filter/split-batches.sh \n",
239
+ " creating: TGIF-Release-master/code/gifs-filter/test/\n",
240
+ " extracting: TGIF-Release-master/code/gifs-filter/test/.gitignore \n",
241
+ " inflating: TGIF-Release-master/code/gifs-filter/test/gif.urls \n",
242
+ " creating: TGIF-Release-master/code/gifs-filter/text-score/\n",
243
+ " inflating: TGIF-Release-master/code/gifs-filter/text-score/.gitignore \n",
244
+ " inflating: TGIF-Release-master/code/gifs-filter/text-score/Makefile \n",
245
+ " inflating: TGIF-Release-master/code/gifs-filter/text-score/README.md \n",
246
+ " inflating: TGIF-Release-master/code/gifs-filter/text-score/debug.sh \n",
247
+ " inflating: TGIF-Release-master/code/gifs-filter/text-score/filter.hpp \n",
248
+ " inflating: TGIF-Release-master/code/gifs-filter/text-score/group_area.hpp \n",
249
+ " creating: TGIF-Release-master/code/gifs-filter/text-score/test/\n",
250
+ " inflating: TGIF-Release-master/code/gifs-filter/text-score/test/README.md \n",
251
+ " inflating: TGIF-Release-master/code/gifs-filter/text-score/test/benchmark.sh \n",
252
+ " inflating: TGIF-Release-master/code/gifs-filter/text-score/test/neg.urls \n",
253
+ " inflating: TGIF-Release-master/code/gifs-filter/text-score/test/pos.urls \n",
254
+ " inflating: TGIF-Release-master/code/gifs-filter/text-score/text-score.cpp \n",
255
+ " inflating: TGIF-Release-master/code/gifs-filter/text-score/text-score.sh \n",
256
+ " inflating: TGIF-Release-master/code/gifs-filter/text-score/textdetection.cpp \n",
257
+ " creating: TGIF-Release-master/data/\n",
258
+ " extracting: TGIF-Release-master/data/.gitignore \n",
259
+ " creating: TGIF-Release-master/data/GIF2Movie/\n",
260
+ " inflating: TGIF-Release-master/data/GIF2Movie/M-VAD.tsv \n",
261
+ " inflating: TGIF-Release-master/data/GIF2Movie/MPII-MD.tsv \n",
262
+ " inflating: TGIF-Release-master/data/README.md \n",
263
+ " creating: TGIF-Release-master/data/coco-caption/\n",
264
+ " inflating: TGIF-Release-master/data/eval.py \n",
265
+ " inflating: TGIF-Release-master/data/results-lstm-cnn-finetune-cvpr16.tsv \n",
266
+ " creating: TGIF-Release-master/data/splits/\n",
267
+ " extracting: TGIF-Release-master/data/splits/.gitignore \n",
268
+ " inflating: TGIF-Release-master/data/splits/test.txt \n",
269
+ " inflating: TGIF-Release-master/data/splits/train.txt \n",
270
+ " inflating: TGIF-Release-master/data/splits/val.txt \n",
271
+ " inflating: TGIF-Release-master/data/tgif-v1.0.tsv \n",
272
+ " creating: TGIF-Release-master/docs/\n",
273
+ " creating: TGIF-Release-master/docs/_includes/\n",
274
+ " inflating: TGIF-Release-master/docs/_includes/authors.html \n",
275
+ " inflating: TGIF-Release-master/docs/_includes/download.html \n",
276
+ " inflating: TGIF-Release-master/docs/_includes/examples.html \n",
277
+ " inflating: TGIF-Release-master/docs/_includes/footer.html \n",
278
+ " inflating: TGIF-Release-master/docs/_includes/head.html \n",
279
+ " inflating: TGIF-Release-master/docs/_includes/header.html \n",
280
+ " inflating: TGIF-Release-master/docs/_includes/nav.html \n",
281
+ " inflating: TGIF-Release-master/docs/_includes/overview.html \n",
282
+ " creating: TGIF-Release-master/docs/_layouts/\n",
283
+ " inflating: TGIF-Release-master/docs/_layouts/default.html \n",
284
+ " creating: TGIF-Release-master/docs/css/\n",
285
+ " inflating: TGIF-Release-master/docs/css/main.scss \n",
286
+ " extracting: TGIF-Release-master/docs/index.html \n",
287
+ " creating: TGIF-Release-master/docs/js/\n",
288
+ " inflating: TGIF-Release-master/docs/js/main.js \n"
289
+ ]
290
+ }
291
+ ],
292
+ "source": [
293
+ "# Use unzip to extract the master.zip file\n",
294
+ "!unzip master.zip"
295
+ ]
296
+ },
297
+ {
298
+ "cell_type": "markdown",
299
+ "metadata": {
300
+ "id": "7agJKFkZ6UGB"
301
+ },
302
+ "source": [
303
+ "## Explore the Dataset"
304
+ ]
305
+ },
306
+ {
307
+ "cell_type": "markdown",
308
+ "metadata": {},
309
+ "source": [
310
+ "Now let's explore the downloaded files. The data we want is in the *tgif-v1.0.tsv* file in the *data* folder. We can use the *pandas* library to open the file. We need to set the delimiter to `\\t` because the file contains tab-separated values."
311
+ ]
312
+ },
313
+ {
314
+ "cell_type": "code",
315
+ "execution_count": 4,
316
+ "metadata": {
317
+ "id": "1rwBQ3I2Ye7c"
318
+ },
319
+ "outputs": [],
320
+ "source": [
321
+ "import pandas as pd"
322
+ ]
323
+ },
324
+ {
325
+ "cell_type": "code",
326
+ "execution_count": 5,
327
+ "metadata": {
328
+ "id": "K8RvBSYSbvUb"
329
+ },
330
+ "outputs": [
331
+ {
332
+ "data": {
333
+ "text/html": [
334
+ "<div>\n",
335
+ "<style scoped>\n",
336
+ " .dataframe tbody tr th:only-of-type {\n",
337
+ " vertical-align: middle;\n",
338
+ " }\n",
339
+ "\n",
340
+ " .dataframe tbody tr th {\n",
341
+ " vertical-align: top;\n",
342
+ " }\n",
343
+ "\n",
344
+ " .dataframe thead th {\n",
345
+ " text-align: right;\n",
346
+ " }\n",
347
+ "</style>\n",
348
+ "<table border=\"1\" class=\"dataframe\">\n",
349
+ " <thead>\n",
350
+ " <tr style=\"text-align: right;\">\n",
351
+ " <th></th>\n",
352
+ " <th>url</th>\n",
353
+ " <th>description</th>\n",
354
+ " </tr>\n",
355
+ " </thead>\n",
356
+ " <tbody>\n",
357
+ " <tr>\n",
358
+ " <th>0</th>\n",
359
+ " <td>https://38.media.tumblr.com/9f6c25cc350f12aa74...</td>\n",
360
+ " <td>a man is glaring, and someone with sunglasses ...</td>\n",
361
+ " </tr>\n",
362
+ " <tr>\n",
363
+ " <th>1</th>\n",
364
+ " <td>https://38.media.tumblr.com/9ead028ef62004ef6a...</td>\n",
365
+ " <td>a cat tries to catch a mouse on a tablet</td>\n",
366
+ " </tr>\n",
367
+ " <tr>\n",
368
+ " <th>2</th>\n",
369
+ " <td>https://38.media.tumblr.com/9f43dc410be85b1159...</td>\n",
370
+ " <td>a man dressed in red is dancing.</td>\n",
371
+ " </tr>\n",
372
+ " <tr>\n",
373
+ " <th>3</th>\n",
374
+ " <td>https://38.media.tumblr.com/9f659499c8754e40cf...</td>\n",
375
+ " <td>an animal comes close to another in the jungle</td>\n",
376
+ " </tr>\n",
377
+ " <tr>\n",
378
+ " <th>4</th>\n",
379
+ " <td>https://38.media.tumblr.com/9ed1c99afa7d714118...</td>\n",
380
+ " <td>a man in a hat adjusts his tie and makes a wei...</td>\n",
381
+ " </tr>\n",
382
+ " </tbody>\n",
383
+ "</table>\n",
384
+ "</div>"
385
+ ],
386
+ "text/plain": [
387
+ " url \\\n",
388
+ "0 https://38.media.tumblr.com/9f6c25cc350f12aa74... \n",
389
+ "1 https://38.media.tumblr.com/9ead028ef62004ef6a... \n",
390
+ "2 https://38.media.tumblr.com/9f43dc410be85b1159... \n",
391
+ "3 https://38.media.tumblr.com/9f659499c8754e40cf... \n",
392
+ "4 https://38.media.tumblr.com/9ed1c99afa7d714118... \n",
393
+ "\n",
394
+ " description \n",
395
+ "0 a man is glaring, and someone with sunglasses ... \n",
396
+ "1 a cat tries to catch a mouse on a tablet \n",
397
+ "2 a man dressed in red is dancing. \n",
398
+ "3 an animal comes close to another in the jungle \n",
399
+ "4 a man in a hat adjusts his tie and makes a wei... "
400
+ ]
401
+ },
402
+ "execution_count": 5,
403
+ "metadata": {},
404
+ "output_type": "execute_result"
405
+ }
406
+ ],
407
+ "source": [
408
+ "# Load dataset to a pandas dataframe\n",
409
+ "df = pd.read_csv(\n",
410
+ " \"./TGIF-Release-master/data/tgif-v1.0.tsv\",\n",
411
+ " delimiter=\"\\t\",\n",
412
+ " names=['url', 'description']\n",
413
+ ")\n",
414
+ "df.head()"
415
+ ]
416
+ },
417
+ {
418
+ "cell_type": "markdown",
419
+ "metadata": {},
420
+ "source": [
421
+ "*Note the dataset does not contain the actual GIF files. But it has URLs we can use to download/access the GIF files. This is great as we do not need to store/download all the GIF files. We can directly load the required GIF files using the URL when displaying the search results.*\n",
422
+ "\n",
423
+ "Some GIF URLs appear more than once in the dataset, each occurrence paired with its own description."
424
+ ]
425
+ },
426
+ {
427
+ "cell_type": "code",
428
+ "execution_count": 6,
429
+ "metadata": {},
430
+ "outputs": [
431
+ {
432
+ "data": {
433
+ "text/plain": [
434
+ "125782"
435
+ ]
436
+ },
437
+ "execution_count": 6,
438
+ "metadata": {},
439
+ "output_type": "execute_result"
440
+ }
441
+ ],
442
+ "source": [
443
+ "len(df)"
444
+ ]
445
+ },
446
+ {
447
+ "cell_type": "code",
448
+ "execution_count": 7,
449
+ "metadata": {},
450
+ "outputs": [
451
+ {
452
+ "data": {
453
+ "text/plain": [
454
+ "102068"
455
+ ]
456
+ },
457
+ "execution_count": 7,
458
+ "metadata": {},
459
+ "output_type": "execute_result"
460
+ }
461
+ ],
462
+ "source": [
463
+ "# Number of *unique* GIFs in the dataset\n",
464
+ "len(df[\"url\"].unique())"
465
+ ]
466
+ },
467
+ {
468
+ "cell_type": "code",
469
+ "execution_count": 8,
470
+ "metadata": {},
471
+ "outputs": [
472
+ {
473
+ "data": {
474
+ "text/plain": [
475
+ "https://38.media.tumblr.com/ddbfe51aff57fd8446f49546bc027bd7/tumblr_nowv0v6oWj1uwbrato1_500.gif 4\n",
476
+ "https://33.media.tumblr.com/46c873a60bb8bd97bdc253b826d1d7a1/tumblr_nh7vnlXEvL1u6fg3no1_500.gif 4\n",
477
+ "https://38.media.tumblr.com/b544f3c87cbf26462dc267740bb1c842/tumblr_n98uooxl0K1thiyb6o1_250.gif 4\n",
478
+ "https://33.media.tumblr.com/88235b43b48e9823eeb3e7890f3d46ef/tumblr_nkg5leY4e21sof15vo1_500.gif 4\n",
479
+ "https://31.media.tumblr.com/69bca8520e1f03b4148dde2ac78469ec/tumblr_npvi0kW4OD1urqm0mo1_400.gif 4\n",
480
+ "Name: url, dtype: int64"
481
+ ]
482
+ },
483
+ "execution_count": 8,
484
+ "metadata": {},
485
+ "output_type": "execute_result"
486
+ }
487
+ ],
488
+ "source": [
489
+ "dupes = df['url'].value_counts().sort_values(ascending=False)\n",
490
+ "dupes.head()"
491
+ ]
492
+ },
493
+ {
494
+ "cell_type": "markdown",
495
+ "metadata": {},
496
+ "source": [
497
+ "Let's take a look at one of these duplicated URLs and its descriptions."
498
+ ]
499
+ },
500
+ {
501
+ "cell_type": "code",
502
+ "execution_count": 9,
503
+ "metadata": {},
504
+ "outputs": [
505
+ {
506
+ "data": {
507
+ "text/html": [
508
+ "<img src=https://33.media.tumblr.com/88235b43b48e9823eeb3e7890f3d46ef/tumblr_nkg5leY4e21sof15vo1_500.gif style='width:120px; height:90px'>"
509
+ ],
510
+ "text/plain": [
511
+ "<IPython.core.display.HTML object>"
512
+ ]
513
+ },
514
+ "execution_count": 9,
515
+ "metadata": {},
516
+ "output_type": "execute_result"
517
+ },
518
+ {
519
+ "name": "stdout",
520
+ "output_type": "stream",
521
+ "text": [
522
+ "two girls are singing music pop in a concert\n"
523
+ ]
524
+ },
525
+ {
526
+ "data": {
527
+ "text/html": [
528
+ "<img src=https://33.media.tumblr.com/88235b43b48e9823eeb3e7890f3d46ef/tumblr_nkg5leY4e21sof15vo1_500.gif style='width:120px; height:90px'>"
529
+ ],
530
+ "text/plain": [
531
+ "<IPython.core.display.HTML object>"
532
+ ]
533
+ },
534
+ "execution_count": 9,
535
+ "metadata": {},
536
+ "output_type": "execute_result"
537
+ },
538
+ {
539
+ "name": "stdout",
540
+ "output_type": "stream",
541
+ "text": [
542
+ "a woman sings sang girl on a stage singing\n"
543
+ ]
544
+ },
545
+ {
546
+ "data": {
547
+ "text/html": [
548
+ "<img src=https://33.media.tumblr.com/88235b43b48e9823eeb3e7890f3d46ef/tumblr_nkg5leY4e21sof15vo1_500.gif style='width:120px; height:90px'>"
549
+ ],
550
+ "text/plain": [
551
+ "<IPython.core.display.HTML object>"
552
+ ]
553
+ },
554
+ "execution_count": 9,
555
+ "metadata": {},
556
+ "output_type": "execute_result"
557
+ },
558
+ {
559
+ "name": "stdout",
560
+ "output_type": "stream",
561
+ "text": [
562
+ "two girls on a stage sing into microphones.\n"
563
+ ]
564
+ },
565
+ {
566
+ "data": {
567
+ "text/html": [
568
+ "<img src=https://33.media.tumblr.com/88235b43b48e9823eeb3e7890f3d46ef/tumblr_nkg5leY4e21sof15vo1_500.gif style='width:120px; height:90px'>"
569
+ ],
570
+ "text/plain": [
571
+ "<IPython.core.display.HTML object>"
572
+ ]
573
+ },
574
+ "execution_count": 9,
575
+ "metadata": {},
576
+ "output_type": "execute_result"
577
+ },
578
+ {
579
+ "name": "stdout",
580
+ "output_type": "stream",
581
+ "text": [
582
+ "two girls dressed in black are singing.\n"
583
+ ]
584
+ }
585
+ ],
586
+ "source": [
587
+ "dupe_url = \"https://33.media.tumblr.com/88235b43b48e9823eeb3e7890f3d46ef/tumblr_nkg5leY4e21sof15vo1_500.gif\"\n",
588
+ "dupe_df = df[df['url'] == dupe_url]\n",
589
+ "\n",
590
+ "# let's take a look at this GIF and its duplicated descriptions\n",
591
+ "for _, gif in dupe_df.iterrows():\n",
592
+ " HTML(f\"<img src={gif['url']} style='width:120px; height:90px'>\")\n",
593
+ " print(gif[\"description\"])"
594
+ ]
595
+ },
596
+ {
597
+ "cell_type": "markdown",
598
+ "metadata": {},
599
+ "source": [
600
+ "There is no reason for us to remove these duplicates: as shown here, every description is accurate. You can spot-check a few of the other URLs, but they all follow the same pattern, with several *accurate* descriptions for a single GIF."
601
+ ]
602
+ },
603
+ {
604
+ "cell_type": "markdown",
605
+ "metadata": {},
606
+ "source": [
607
+ "That leaves us with 125,781 descriptions for 102,067 GIFs. We will use these descriptions to create *context* vectors that will be indexed in a vector database to create our GIF search tool. Let's take a look at a few more examples of GIFs and their descriptions."
608
+ ]
609
+ },
610
+ {
611
+ "cell_type": "code",
612
+ "execution_count": 10,
613
+ "metadata": {
614
+ "colab": {
615
+ "base_uri": "https://localhost:8080/",
616
+ "height": 577
617
+ },
618
+ "id": "m0_jfDW6hl4C",
619
+ "outputId": "bcfb0ae3-4c44-4354-e42d-93a3ee35ff2d"
620
+ },
621
+ "outputs": [
622
+ {
623
+ "data": {
624
+ "text/html": [
625
+ "<img src=https://38.media.tumblr.com/9f6c25cc350f12aa74a7dc386a5c4985/tumblr_mevmyaKtDf1rgvhr8o1_500.gif style='width:120px; height:90px'>"
626
+ ],
627
+ "text/plain": [
628
+ "<IPython.core.display.HTML object>"
629
+ ]
630
+ },
631
+ "execution_count": 10,
632
+ "metadata": {},
633
+ "output_type": "execute_result"
634
+ },
635
+ {
636
+ "name": "stdout",
637
+ "output_type": "stream",
638
+ "text": [
639
+ "a man is glaring, and someone with sunglasses appears.\n"
640
+ ]
641
+ },
642
+ {
643
+ "data": {
644
+ "text/html": [
645
+ "<img src=https://38.media.tumblr.com/9ead028ef62004ef6ac2b92e52edd210/tumblr_nok4eeONTv1s2yegdo1_400.gif style='width:120px; height:90px'>"
646
+ ],
647
+ "text/plain": [
648
+ "<IPython.core.display.HTML object>"
649
+ ]
650
+ },
651
+ "execution_count": 10,
652
+ "metadata": {},
653
+ "output_type": "execute_result"
654
+ },
655
+ {
656
+ "name": "stdout",
657
+ "output_type": "stream",
658
+ "text": [
659
+ "a cat tries to catch a mouse on a tablet\n"
660
+ ]
661
+ },
662
+ {
663
+ "data": {
664
+ "text/html": [
665
+ "<img src=https://38.media.tumblr.com/9f43dc410be85b1159d1f42663d811d7/tumblr_mllh01J96X1s9npefo1_250.gif style='width:120px; height:90px'>"
666
+ ],
667
+ "text/plain": [
668
+ "<IPython.core.display.HTML object>"
669
+ ]
670
+ },
671
+ "execution_count": 10,
672
+ "metadata": {},
673
+ "output_type": "execute_result"
674
+ },
675
+ {
676
+ "name": "stdout",
677
+ "output_type": "stream",
678
+ "text": [
679
+ "a man dressed in red is dancing.\n"
680
+ ]
681
+ },
682
+ {
683
+ "data": {
684
+ "text/html": [
685
+ "<img src=https://38.media.tumblr.com/9f659499c8754e40cf3f7ac21d08dae6/tumblr_nqlr0rn8ox1r2r0koo1_400.gif style='width:120px; height:90px'>"
686
+ ],
687
+ "text/plain": [
688
+ "<IPython.core.display.HTML object>"
689
+ ]
690
+ },
691
+ "execution_count": 10,
692
+ "metadata": {},
693
+ "output_type": "execute_result"
694
+ },
695
+ {
696
+ "name": "stdout",
697
+ "output_type": "stream",
698
+ "text": [
699
+ "an animal comes close to another in the jungle\n"
700
+ ]
701
+ },
702
+ {
703
+ "data": {
704
+ "text/html": [
705
+ "<img src=https://38.media.tumblr.com/9ed1c99afa7d71411884101cb054f35f/tumblr_mvtuwlhSkE1qbnleeo1_500.gif style='width:120px; height:90px'>"
706
+ ],
707
+ "text/plain": [
708
+ "<IPython.core.display.HTML object>"
709
+ ]
710
+ },
711
+ "execution_count": 10,
712
+ "metadata": {},
713
+ "output_type": "execute_result"
714
+ },
715
+ {
716
+ "name": "stdout",
717
+ "output_type": "stream",
718
+ "text": [
719
+ "a man in a hat adjusts his tie and makes a weird face.\n"
720
+ ]
721
+ }
722
+ ],
723
+ "source": [
724
+ "for _, gif in df[:5].iterrows():\n",
725
+ " HTML(f\"<img src={gif['url']} style='width:120px; height:90px'>\")\n",
726
+ " print(gif[\"description\"])"
727
+ ]
728
+ },
729
+ {
730
+ "cell_type": "markdown",
731
+ "metadata": {},
732
+ "source": [
733
+ "We can see that each description accurately captures what is happening in its GIF, so we can use these descriptions to search through our GIFs."
734
+ ]
735
+ },
736
+ {
737
+ "cell_type": "markdown",
738
+ "metadata": {},
739
+ "source": [
740
+ "Using this data, we can build the GIF search tool with just *two* components:\n",
741
+ "\n",
742
+ "* a **retriever** to embed GIF descriptions\n",
743
+ "* a **vector database** to store GIF description embeddings and retrieve relevant GIFs"
744
+ ]
745
+ },
746
+ {
747
+ "cell_type": "markdown",
748
+ "metadata": {
749
+ "id": "zrKIRGeo6ehR"
750
+ },
751
+ "source": [
752
+ "## Initialize Pinecone Index"
753
+ ]
754
+ },
755
+ {
756
+ "cell_type": "markdown",
757
+ "metadata": {},
758
+ "source": [
759
+ "The vector database stores vector representations of our GIF descriptions, which we can retrieve using another vector (the query vector). We will use the Pinecone vector database, a fully managed vector database that can store and search through billions of records in milliseconds. You could use another vector search tool, such as FAISS, to build this tool, but you may need to manage the infrastructure yourself.\n",
760
+ "\n",
761
+ "To initialize the database, we sign up for a [free Pinecone API key](https://app.pinecone.io/) and `pip install pinecone-client`. Once ready, we initialize our index with:"
762
+ ]
763
+ },
764
+ {
765
+ "cell_type": "code",
766
+ "execution_count": 12,
767
+ "metadata": {
768
+ "id": "Ngbs8wQQoePL"
769
+ },
770
+ "outputs": [],
771
+ "source": [
772
+ "import pinecone\n",
773
+ "\n",
774
+ "# Connect to pinecone environment\n",
775
+ "pinecone.init(\n",
776
+ " api_key=\"<<YOUR_API_KEY>>\",\n",
777
+ " environment=\"us-west1-gcp\"\n",
778
+ ")\n",
779
+ "\n",
780
+ "index_name = 'gif-search'\n",
781
+ "\n",
782
+ "# check if the gif-search exists\n",
783
+ "if index_name not in pinecone.list_indexes():\n",
784
+ " # create the index if it does not exist\n",
785
+ " pinecone.create_index(\n",
786
+ " index_name,\n",
787
+ " dimension=384,\n",
788
+ " metric=\"cosine\"\n",
789
+ " )\n",
790
+ "\n",
791
+ "# Connect to gif-search index we created\n",
792
+ "index = pinecone.Index(index_name)"
793
+ ]
794
+ },
795
+ {
796
+ "cell_type": "markdown",
797
+ "metadata": {},
798
+ "source": [
799
+ "Here we specify the name of the index where we will store our GIF descriptions and their URLs, the similarity metric, and the embedding dimension of the vectors. The similarity metric and embedding dimension depend on the embedding model used. Many retrievers use \"cosine\" and 768 dimensions; we set the dimension to 384 because the `all-MiniLM-L6-v2` retriever initialized below outputs 384-dimensional embeddings."
800
+ ]
801
+ },
802
+ {
803
+ "cell_type": "markdown",
804
+ "metadata": {
805
+ "id": "D5mGU3ub6kkb"
806
+ },
807
+ "source": [
808
+ "## Initialize Retriever"
809
+ ]
810
+ },
811
+ {
812
+ "cell_type": "markdown",
813
+ "metadata": {},
814
+ "source": [
815
+ "Next, we need to initialize our retriever. The retriever will mainly do two things:\n",
816
+ "\n",
817
+ "1.\tGenerate embeddings for all the GIF descriptions (context vectors/embeddings)\n",
818
+ "2.\tGenerate embeddings for the query (query vector/embedding)\n",
819
+ "\n",
820
+ "The retriever will generate the embeddings in a way that the queries and GIF descriptions with similar meanings are in a similar vector space. Then we can use cosine similarity to calculate this similarity between the query and context embeddings and find the most relevant GIF to our query.\n",
821
+ "\n",
822
+ "We will use the `all-MiniLM-L6-v2` `SentenceTransformer` model, which is based on Microsoft's MiniLM, as our retriever. This model performs well out-of-the-box when searching based on generic semantic similarity. "
823
+ ]
824
+ },
825
+ {
826
+ "cell_type": "code",
827
+ "execution_count": 13,
828
+ "metadata": {
829
+ "id": "jtqu5O9Y6q8x"
830
+ },
831
+ "outputs": [
832
+ {
833
+ "name": "stderr",
834
+ "output_type": "stream",
835
+ "text": [
836
+ "2022-08-13 16:42:37.258365: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n"
837
+ ]
838
+ }
839
+ ],
840
+ "source": [
841
+ "from sentence_transformers import SentenceTransformer"
842
+ ]
843
+ },
844
+ {
845
+ "cell_type": "code",
846
+ "execution_count": 14,
847
+ "metadata": {
848
+ "colab": {
849
+ "base_uri": "https://localhost:8080/",
850
+ "height": 465,
851
+ "referenced_widgets": [
852
+ "3bbdb7ac9ddb4e61acbf10e0e322b464",
853
+ "425825b26a384e158608f34c327e7be7",
854
+ "9df1cc108ad74d84a52b25ca0e835197",
855
+ "d9e3f2ddbf5e47ebb86fb20c39079354",
856
+ "fe02dbbb561147198c3278388cb40d04",
857
+ "02851f413fa34d12b412e036d785d38e",
858
+ "1215630e8b53423390353cd56a044c6e",
859
+ "beeffeb083574f0a8c702b6035474073",
860
+ "a40d7b1c14944323937291f6a834061b",
861
+ "2063122bcf15496987749c8cca733a8b",
862
+ "02796d5819bd4b77b56c6f9cab93c908",
863
+ "b69d8608edce436c826f87e269a3b1ec",
864
+ "5e30b575e33843e9b68bc34a84c6e6b2",
865
+ "a6941924044b4956afa6c9b458141007",
866
+ "a2d42ce089df49b896ba9223f6df2ac2",
867
+ "b329d7774a1a4817a4e9cea7adfb288d",
868
+ "4bfd979f8e1a4bd889b846d6affb6d3f",
869
+ "d2088a6432ed4403b9602c5fecb09246",
870
+ "071f6d3e71694ddab08360cabf5e7ecd",
871
+ "f8c2899ce8264b7fa7d5552783c71ad3",
872
+ "850040d37e2b45ecaa53c82958dd9d6a",
873
+ "08779212fc3c46629ea7b5ae778b282b",
874
+ "97b11b05410f4b0682a223efd6eeb570",
875
+ "b5732b4221f448a8b8ae2363849214eb",
876
+ "c7c897d4905f4af28a4e0e8e72897f74",
877
+ "9066e6a9f5384ceaba6d58353c3e0a6a",
878
+ "5085c2de59b34ffe8902285e42f32401",
879
+ "8387420e74e94768893a7415243d4a12",
880
+ "b3fa9e20d0524f0bbbe65575ccd75126",
881
+ "24d25ddd24084a49b47f02138db4b592",
882
+ "b4e62b032b884838b46bcc6b62c0c3ca",
883
+ "a54f0fc3dc184e20bf7660ddbf890a76",
884
+ "b66d0aedfb414d4c8cac45e0f8d157ac",
885
+ "d6e028d4612241fba005c9dd3fe12d30",
886
+ "a765f4490ae44f959bc594bb3b3b719a",
887
+ "0496b4ff5eec45f08c5a1fa1ef106777",
888
+ "976ba55ccfc0419d9b357d83eb4e4ab9",
889
+ "de5a4a6c3495413fa200ab92d876f509",
890
+ "4fb7a30442f94068bdb7c9e434cd03b6",
891
+ "242c1203c62b42b9ae32af653267f8c7",
892
+ "780b68a97f184b8cb7a04e3412d050ab",
893
+ "f6685ce8e5a843f0b856c64a5b71c7ee",
894
+ "6d8f66edc20543b9baae3b474201861e",
895
+ "7d4faf89e1a34c3592a8d8b5c332cf9a",
896
+ "aaff54fb4ed64fbba356616ebad675c3",
897
+ "1b85eb15f8584fd5afceddd1ea3ea2e0",
898
+ "ba4b82c10e804d3ea76592f2d2a3832c",
899
+ "6d2b705a90a447579864e91618f70676",
900
+ "a0d67c99dee248f280cbe34fc3de48a6",
901
+ "42ca2987c03f46e9985a98b4a137f95a",
902
+ "3d044c94cbd741adbf826306bd882971",
903
+ "9a612735a6b94182bafe9a0b42648455",
904
+ "3965cf39aef641dbb2d0b2a362e1c8c2",
905
+ "369f1ee2f1844327b3074a31c0e12519",
906
+ "0e1839f9d1fc43b18f84d0f554a54cc2",
907
+ "272faddd67724961a8bdcccba6b196b0",
908
+ "922e13c3b6944e65a04d96147dd3c9ad",
909
+ "aac3d96ceb444486b6f7f760b43ffc79",
910
+ "04a7db113b11418ab275879f0f8bf162",
911
+ "fd337d66d3a14867ae76908cd58e313b",
912
+ "462563ff268e47d5a8c9efffe09604ed",
913
+ "abbefd5572204fb4b3c0bf7c5597eb8e",
914
+ "69e75d6d62494209b2ee828d1710a59e",
915
+ "45240e5ad3a6426bbfc5e3c1880ac781",
916
+ "67dfd1d8ff4043239f31bb3a53ce4ccc",
917
+ "6aa46bc05e6b4a139009a4124b0f80ec",
918
+ "9ab75ac0bf404362ab25e898b172110c",
919
+ "005d8360dc824d1f869d0eca3b3ec9fd",
920
+ "aa556238d9174c8e83187d079f229b97",
921
+ "65e7a017905442ae98c6415bcdfe0bb4",
922
+ "97302900f6604d7991df8c9854d95728",
923
+ "0ebe49597de647a5a4da64d03c471b76",
924
+ "40e00bdafe664e6082d5fc4392cbc4bc",
925
+ "32719d55aa0e49d7bb65e7755fc2f572",
926
+ "24eb84536c324805bf159da81d8ad509",
927
+ "6565c0393fee4725abb04e245a9a6fb0",
928
+ "2fe6b2e1c56b48fe9b056ee8d0d3697c",
929
+ "e6e9f5860acc4fdda68eafe6abbc99db",
930
+ "7f84fd3a40bb40dbb742101403e671c9",
931
+ "48e79feebdc043adae21f571319493db",
932
+ "2d07707632dc4c60bca7afbbeb928ca6",
933
+ "71600400fa14406bb81e2e410d116681",
934
+ "2540fac951cd4883833b586558c9081e",
935
+ "759f82584cc84eedbbf8fb6142b0d657",
936
+ "693abe44ba9a463ca9d08e1b6c54673b",
937
+ "024be013b90d4a309b2f54d979f1269a",
938
+ "0097b88603d8424f8fc1696b17bd165e",
939
+ "a1d9893a7c7345a49cd336610e0ba3f5",
940
+ "fefd20a4306b4dad840a79038db4c0a3",
941
+ "c985a33ed0064c589e78a72080c382e7",
942
+ "4e37e65238634a97a91e8449e24a99c2",
943
+ "0bb8029a6b2e49de98586fa091033387",
944
+ "fd631c1a5515475a873dceb33e21ac7f",
945
+ "2b86f4f2dbf34220ab607cb010ad3ef7",
946
+ "8da6bd57b0f44a65be01086f33313f2a",
947
+ "24f2ae769e2c4d829dffa14ca8928d3b",
948
+ "1e64de324773474d994258dbe9ed736f",
949
+ "c7f564bd62324a048511ff82b8d4c369",
950
+ "71e6d768200b48ddab9375d0266c6756",
951
+ "fbd414ab7af64c2f840d19d1315085c4",
952
+ "f2983bd01cab40469306b9bdea3b3b19",
953
+ "825ae4cc8891473f8feb1b2820b0c0aa",
954
+ "8d8f963d9b6c409d8c21e307958013d8",
955
+ "8f2d54ebde5341808e65a705f15ba036",
956
+ "1f7356b83d474a0f8ed49a4a5fae5d22",
957
+ "55e099fba0fb41f999f690884f80843a",
958
+ "2f0edbcbec1d44378f4bc17535a87b76",
959
+ "6c709e1a7d174633b603df7bf0804756",
960
+ "8b88e836a5884b3db7748d5971463e62",
961
+ "9cb993dbaa5746249fed2df7d8db677c",
962
+ "87b1f938065644cba745c0b0952526b5",
963
+ "9d46acccbf1347c1bfd7b61e84591812",
964
+ "8f240b5dc5274d63bb23ba9979d47163",
965
+ "783b444819ab4454bceb16578c71df39",
966
+ "b1c876108e6b4150bf3638fad1a56157",
967
+ "6afdd5eede004470a8c7eb8709f6182b",
968
+ "f4ae6887074f443299e95e6cebf971ce",
969
+ "b3fbdd74fd764c65a25817690c961de6",
970
+ "d141eee9a0f14702a82cfe1b194f9d89",
971
+ "eeffd24564624122afc795be6bd1131b",
972
+ "8075cad347314feabdb19d6902752b27",
973
+ "b3c2f3d5b805460496a74f2009aaf81c",
974
+ "4247478ac3b742a7a9407bc69ed179e6",
975
+ "d3abd49739b2493b8d52a062e90ae3d6",
976
+ "af1cdb5fdb31467aa54bc5b9c1ec255d",
977
+ "4124c49ef7d9497187b8e29af694d54f",
978
+ "1e7f1c9aa28e4dcd80f6b9c6b9c06fd6",
979
+ "73c159f67bec448aa102470f1abd0ca7",
980
+ "393b7df64cbd4072acf090f3dee2108c",
981
+ "f2c52012dfd84e39851e297d56669213",
982
+ "665cc77211d0466cbb2d4a2a95547547",
983
+ "70ca64c48bad4c42ba0208eecfc3e40e",
984
+ "176a01c7622143c38482371fe2f6cfaf",
985
+ "3bf92cfae4b547ce9f91e6b4dd94f25a",
986
+ "493c065a83ad48fc921b5b11c59c8fed",
987
+ "c0b2ebd2a44845e885863ee18a4e121e",
988
+ "c070c05c63ef48908126fac009b33ea4",
989
+ "33c35634d79f4848a32e6bd46cb1a75b",
990
+ "c0caab84bfcb4bb69ed70c73e5d20b59",
991
+ "95fa10da3a1742ea9fb3ae3bf948618a",
992
+ "00ba06783c7b484f91aceefd8be27782",
993
+ "8b9224015a1a466f982f6d51e1a8591d",
994
+ "e71eff0e7d2d4e6a9949c29395ac947d",
995
+ "a05a15c5b5a34a5cbc7a0647a01826fb",
996
+ "b4f4e484c3c94424a37f6746a590fffd",
997
+ "e993d4821d984becbb06458f62252f1d",
998
+ "93ba13309f954c669989147c0823c06b",
999
+ "4c5388f0293f4bf384332ba0bc775b1c",
1000
+ "4d2d0d8ce7594980aef48885259dafef",
1001
+ "da474d5e97ef4947936ca0dd34a30126",
1002
+ "d0f5166f81f141a88d282b4eb322ec4c",
1003
+ "98bd1a3b82e74f8ab86645815d8ca526",
1004
+ "8a06eaff96f94f2fb8f95fc92ef87651",
1005
+ "123ad9cff4a74609a452474d73c6738c"
1006
+ ]
1007
+ },
1008
+ "id": "UB0rVxmppnkm",
1009
+ "outputId": "cd8ed4e8-69a0-4ce9-c974-cda0dca998a8",
1010
+ "scrolled": true
1011
+ },
1012
+ "outputs": [
1013
+ {
1014
+ "data": {
1015
+ "application/vnd.jupyter.widget-view+json": {
1016
+ "model_id": "d45cf2e391ff4550bc6210a4145de5c7",
1017
+ "version_major": 2,
1018
+ "version_minor": 0
1019
+ },
1020
+ "text/plain": [
1021
+ "Downloading: 0%| | 0.00/1.18k [00:00<?, ?B/s]"
1022
+ ]
1023
+ },
1024
+ "metadata": {},
1025
+ "output_type": "display_data"
1026
+ },
1027
+ {
1028
+ "data": {
1029
+ "application/vnd.jupyter.widget-view+json": {
1030
+ "model_id": "ef4d186056694fcaaeee687144b08721",
1031
+ "version_major": 2,
1032
+ "version_minor": 0
1033
+ },
1034
+ "text/plain": [
1035
+ "Downloading: 0%| | 0.00/190 [00:00<?, ?B/s]"
1036
+ ]
1037
+ },
1038
+ "metadata": {},
1039
+ "output_type": "display_data"
1040
+ },
1041
+ {
1042
+ "data": {
1043
+ "application/vnd.jupyter.widget-view+json": {
1044
+ "model_id": "591d827499f148b19b02270a162f2c67",
1045
+ "version_major": 2,
1046
+ "version_minor": 0
1047
+ },
1048
+ "text/plain": [
1049
+ "Downloading: 0%| | 0.00/10.6k [00:00<?, ?B/s]"
1050
+ ]
1051
+ },
1052
+ "metadata": {},
1053
+ "output_type": "display_data"
1054
+ },
1055
+ {
1056
+ "data": {
1057
+ "application/vnd.jupyter.widget-view+json": {
1058
+ "model_id": "31bdc1880499479aa3ce39c6d4b72c23",
1059
+ "version_major": 2,
1060
+ "version_minor": 0
1061
+ },
1062
+ "text/plain": [
1063
+ "Downloading: 0%| | 0.00/612 [00:00<?, ?B/s]"
1064
+ ]
1065
+ },
1066
+ "metadata": {},
1067
+ "output_type": "display_data"
1068
+ },
1069
+ {
1070
+ "data": {
1071
+ "application/vnd.jupyter.widget-view+json": {
1072
+ "model_id": "ce5017b086284f6e8b5941d3a93b5fe3",
1073
+ "version_major": 2,
1074
+ "version_minor": 0
1075
+ },
1076
+ "text/plain": [
1077
+ "Downloading: 0%| | 0.00/116 [00:00<?, ?B/s]"
1078
+ ]
1079
+ },
1080
+ "metadata": {},
1081
+ "output_type": "display_data"
1082
+ },
1083
+ {
1084
+ "data": {
1085
+ "application/vnd.jupyter.widget-view+json": {
1086
+ "model_id": "4918c07d11a545e9badd68551c4702e5",
1087
+ "version_major": 2,
1088
+ "version_minor": 0
1089
+ },
1090
+ "text/plain": [
1091
+ "Downloading: 0%| | 0.00/39.3k [00:00<?, ?B/s]"
1092
+ ]
1093
+ },
1094
+ "metadata": {},
1095
+ "output_type": "display_data"
1096
+ },
1097
+ {
1098
+ "data": {
1099
+ "application/vnd.jupyter.widget-view+json": {
1100
+ "model_id": "e43629206fa9410fb64c8d21b7152734",
1101
+ "version_major": 2,
1102
+ "version_minor": 0
1103
+ },
1104
+ "text/plain": [
1105
+ "Downloading: 0%| | 0.00/349 [00:00<?, ?B/s]"
1106
+ ]
1107
+ },
1108
+ "metadata": {},
1109
+ "output_type": "display_data"
1110
+ },
1111
+ {
1112
+ "data": {
1113
+ "application/vnd.jupyter.widget-view+json": {
1114
+ "model_id": "0b5d2c9385084177a985459d2269bcaa",
1115
+ "version_major": 2,
1116
+ "version_minor": 0
1117
+ },
1118
+ "text/plain": [
1119
+ "Downloading: 0%| | 0.00/90.9M [00:00<?, ?B/s]"
1120
+ ]
1121
+ },
1122
+ "metadata": {},
1123
+ "output_type": "display_data"
1124
+ },
1125
+ {
1126
+ "data": {
1127
+ "application/vnd.jupyter.widget-view+json": {
1128
+ "model_id": "b545fcd0d6d8478e96e124e6c88e17f2",
1129
+ "version_major": 2,
1130
+ "version_minor": 0
1131
+ },
1132
+ "text/plain": [
1133
+ "Downloading: 0%| | 0.00/53.0 [00:00<?, ?B/s]"
1134
+ ]
1135
+ },
1136
+ "metadata": {},
1137
+ "output_type": "display_data"
1138
+ },
1139
+ {
1140
+ "data": {
1141
+ "application/vnd.jupyter.widget-view+json": {
1142
+ "model_id": "12b46a902b72488a9bcfa9052cb4f49c",
1143
+ "version_major": 2,
1144
+ "version_minor": 0
1145
+ },
1146
+ "text/plain": [
1147
+ "Downloading: 0%| | 0.00/112 [00:00<?, ?B/s]"
1148
+ ]
1149
+ },
1150
+ "metadata": {},
1151
+ "output_type": "display_data"
1152
+ },
1153
+ {
1154
+ "data": {
1155
+ "application/vnd.jupyter.widget-view+json": {
1156
+ "model_id": "2d9f462c654d489e8c4fd8a63a955890",
1157
+ "version_major": 2,
1158
+ "version_minor": 0
1159
+ },
1160
+ "text/plain": [
1161
+ "Downloading: 0%| | 0.00/466k [00:00<?, ?B/s]"
1162
+ ]
1163
+ },
1164
+ "metadata": {},
1165
+ "output_type": "display_data"
1166
+ },
1167
+ {
1168
+ "data": {
1169
+ "application/vnd.jupyter.widget-view+json": {
1170
+ "model_id": "72ca4bdbdf4d4f5f866c3889e7a804fa",
1171
+ "version_major": 2,
1172
+ "version_minor": 0
1173
+ },
1174
+ "text/plain": [
1175
+ "Downloading: 0%| | 0.00/350 [00:00<?, ?B/s]"
1176
+ ]
1177
+ },
1178
+ "metadata": {},
1179
+ "output_type": "display_data"
1180
+ },
1181
+ {
1182
+ "data": {
1183
+ "application/vnd.jupyter.widget-view+json": {
1184
+ "model_id": "54e316878bae41cfb9bb2296de7dba2b",
1185
+ "version_major": 2,
1186
+ "version_minor": 0
1187
+ },
1188
+ "text/plain": [
1189
+ "Downloading: 0%| | 0.00/13.2k [00:00<?, ?B/s]"
1190
+ ]
1191
+ },
1192
+ "metadata": {},
1193
+ "output_type": "display_data"
1194
+ },
1195
+ {
1196
+ "data": {
1197
+ "application/vnd.jupyter.widget-view+json": {
1198
+ "model_id": "b548bc12172749c6a31d391313b4c70a",
1199
+ "version_major": 2,
1200
+ "version_minor": 0
1201
+ },
1202
+ "text/plain": [
1203
+ "Downloading: 0%| | 0.00/232k [00:00<?, ?B/s]"
1204
+ ]
1205
+ },
1206
+ "metadata": {},
1207
+ "output_type": "display_data"
1208
+ },
1209
+ {
1210
+ "data": {
1211
+ "text/plain": [
1212
+ "SentenceTransformer(\n",
1213
+ " (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel \n",
1214
+ " (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})\n",
1215
+ " (2): Normalize()\n",
1216
+ ")"
1217
+ ]
1218
+ },
1219
+ "execution_count": 14,
1220
+ "metadata": {},
1221
+ "output_type": "execute_result"
1222
+ }
1223
+ ],
1224
+ "source": [
1225
+ "# Initialize retriever with SentenceTransformer model \n",
1226
+ "retriever = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')\n",
1227
+ "retriever"
1228
+ ]
1229
+ },
1230
+ {
1231
+ "cell_type": "markdown",
1232
+ "metadata": {
1233
+ "id": "0l5JHT4K6xH9"
1234
+ },
1235
+ "source": [
1236
+ "## Generate Embeddings and Upsert"
1237
+ ]
1238
+ },
1239
+ {
1240
+ "cell_type": "markdown",
1241
+ "metadata": {},
1242
+ "source": [
1243
+ "Now our retriever and the pinecone index are initialized. Next, we need to generate embeddings for the GIF descriptions. We will do this in batches to help us more quickly generate embeddings. This means our retriever will generate embeddings for 64 GIF descriptions at once instead of generating them individually (much faster) and send a single API call for each batch of 64 (also much faster).\n",
1244
+ "\n",
1245
+ "When passing the documents to pinecone, we need an id (a unique value), embedding (embeddings for the GIF descriptions we have generated earlier), and metadata for each document representing GIFs in the dataset. The metadata is a dictionary containing data relevant to our embeddings. For the GIF search tool, we only need the URL and description."
1246
+ ]
1247
+ },
1248
+ {
1249
+ "cell_type": "code",
1250
+ "execution_count": 16,
1251
+ "metadata": {},
1252
+ "outputs": [
1253
+ {
1254
+ "name": "stdout",
1255
+ "output_type": "stream",
1256
+ "text": [
1257
+ "cuda\n"
1258
+ ]
1259
+ }
1260
+ ],
1261
+ "source": [
1262
+ "import torch\n",
1263
+ "\n",
1264
+ "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
1265
+ "\n",
1266
+ "print(device)"
1267
+ ]
1268
+ },
1269
+ {
1270
+ "cell_type": "code",
1271
+ "execution_count": 17,
1272
+ "metadata": {
1273
+ "colab": {
1274
+ "base_uri": "https://localhost:8080/",
1275
+ "height": 49,
1276
+ "referenced_widgets": [
1277
+ "fd9832ba430449d6aefdf4f56603751a",
1278
+ "f18bf9eeb43f4dbcb3ad0a237da6308b",
1279
+ "8493032e71ed4437b1f221c6fcaac709",
1280
+ "702ec6e219d74d99a6c929dd65f549ec",
1281
+ "94843930b2ab44438afe2875c44fff96",
1282
+ "304c08deba6248da82971665e0caa20b",
1283
+ "10e79e8c05db4b36b5103116d1210587",
1284
+ "3cc568020c5a4f34992eec0d195ae4f9",
1285
+ "08d4c47abb954cc7b277769cb86bb9a9",
1286
+ "ef45e711623c4f9ba27743de2db32e07",
1287
+ "9752dafd1edb478fba9060f29882c47b"
1288
+ ]
1289
+ },
1290
+ "id": "TdwtDfIxw7Ik",
1291
+ "outputId": "32811e16-1934-474d-bd82-a12fccf29cf9"
1292
+ },
1293
+ "outputs": [
1294
+ {
1295
+ "data": {
1296
+ "text/plain": [
1297
+ "SentenceTransformer(\n",
1298
+ " (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel \n",
1299
+ " (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})\n",
1300
+ " (2): Normalize()\n",
1301
+ ")"
1302
+ ]
1303
+ },
1304
+ "execution_count": 17,
1305
+ "metadata": {},
1306
+ "output_type": "execute_result"
1307
+ },
1308
+ {
1309
+ "data": {
1310
+ "application/vnd.jupyter.widget-view+json": {
1311
+ "model_id": "8c6dc1753afd496187d59ee23ca45744",
1312
+ "version_major": 2,
1313
+ "version_minor": 0
1314
+ },
1315
+ "text/plain": [
1316
+ " 0%| | 0/1966 [00:00<?, ?it/s]"
1317
+ ]
1318
+ },
1319
+ "metadata": {},
1320
+ "output_type": "display_data"
1321
+ },
1322
+ {
1323
+ "data": {
1324
+ "text/plain": [
1325
+ "{'dimension': 384,\n",
1326
+ " 'index_fullness': 0.1,\n",
1327
+ " 'namespaces': {'': {'vector_count': 125782}},\n",
1328
+ " 'totalVectorCount': 125782.0}"
1329
+ ]
1330
+ },
1331
+ "execution_count": 17,
1332
+ "metadata": {},
1333
+ "output_type": "execute_result"
1334
+ }
1335
+ ],
1336
+ "source": [
1337
+ "from tqdm.auto import tqdm\n",
1338
+ "\n",
1339
+ "# we will use batches of 64\n",
1340
+ "batch_size = 64\n",
1341
+ "# move retriever to CUDA if possible\n",
1342
+ "retriever.to(device)\n",
1343
+ "\n",
1344
+ "for i in tqdm(range(0, len(df), batch_size)):\n",
1345
+ " # find end of batch\n",
1346
+ " i_end = min(i+batch_size, len(df))\n",
1347
+ " # extract batch\n",
1348
+ " batch = df.iloc[i:i_end]\n",
1349
+ " # generate embeddings for batch\n",
1350
+ " emb = retriever.encode(batch['description'].tolist()).tolist()\n",
1351
+ " # get metadata\n",
1352
+ " meta = batch.to_dict(orient='records')\n",
1353
+ " # create IDs\n",
1354
+ " ids = [f\"{idx}\" for idx in range(i, i_end)]\n",
1355
+ " # add all to upsert list\n",
1356
+ " to_upsert = list(zip(ids, emb, meta))\n",
1357
+ " # upsert/insert these records to pinecone\n",
1358
+ " _ = index.upsert(vectors=to_upsert)\n",
1359
+ "\n",
1360
+ " \n",
1361
+ "# check that we have all vectors in index\n",
1362
+ "index.describe_index_stats()"
1363
+ ]
1364
+ },
1365
+ {
1366
+ "cell_type": "markdown",
1367
+ "metadata": {},
1368
+ "source": [
1369
+ "We can see all our documents are now in the pinecone index. Let's run some queries to test our GIF search tool."
1370
+ ]
1371
+ },
1372
+ {
1373
+ "cell_type": "markdown",
1374
+ "metadata": {
1375
+ "id": "nR5wJ9Ba59_Z"
1376
+ },
1377
+ "source": [
1378
+ "## Querying"
1379
+ ]
1380
+ },
1381
+ {
1382
+ "cell_type": "markdown",
1383
+ "metadata": {},
1384
+ "source": [
1385
+ "We have two functions, `search_gif`, to handle our search query, and `display_gif`, to display the search results.\n",
1386
+ "\n",
1387
+ "The `search_gif` function generates a vector embedding for the search query using the retriever model and then runs the query on the Pinecone index. `index.query` will compute the cosine similarity between the query embedding and the GIF description embeddings, because we set the metric to \"cosine\" when we initialized the Pinecone index. The function will return the URLs of the top 10 most relevant GIFs for our search query."
1388
+ ]
1389
+ },
1390
+ {
1391
+ "cell_type": "code",
1392
+ "execution_count": 18,
1393
+ "metadata": {
1394
+ "id": "FdHTSCo-6ElT"
1395
+ },
1396
+ "outputs": [],
1397
+ "source": [
1398
+ "def search_gif(query):\n",
1399
+ " # Generate embeddings for the query\n",
1400
+ " xq = retriever.encode(query).tolist()\n",
1401
+ " # Compute cosine similarity between the query and context vectors and return the top 10 URLs\n",
1402
+ " xc = index.query(xq, top_k=10,\n",
1403
+ " include_metadata=True)\n",
1404
+ " result = []\n",
1405
+ " for context in xc['matches']:\n",
1406
+ " url = context['metadata']['url']\n",
1407
+ " result.append(url)\n",
1408
+ " return result"
1409
+ ]
1410
+ },
1411
+ {
1412
+ "cell_type": "markdown",
1413
+ "metadata": {},
1414
+ "source": [
1415
+ "The `display_gif` function displays multiple GIFs from their URLs in a grid layout inside the Jupyter notebook. We use this function to display the top 10 GIFs returned by the `search_gif` function."
1416
+ ]
1417
+ },
1418
+ {
1419
+ "cell_type": "code",
1420
+ "execution_count": 19,
1421
+ "metadata": {
1422
+ "id": "PCi0cwnW7Y07"
1423
+ },
1424
+ "outputs": [],
1425
+ "source": [
1426
+ "def display_gif(urls):\n",
1427
+ " figures = []\n",
1428
+ " for url in urls:\n",
1429
+ " figures.append(f'''\n",
1430
+ " <figure style=\"margin: 5px !important;\">\n",
1431
+ " <img src=\"{url}\" style=\"width: 120px; height: 90px\" >\n",
1432
+ " </figure>\n",
1433
+ " ''')\n",
1434
+ " return HTML(data=f'''\n",
1435
+ " <div style=\"display: flex; flex-flow: row wrap; text-align: center;\">\n",
1436
+ " {''.join(figures)}\n",
1437
+ " </div>\n",
1438
+ " ''')"
1439
+ ]
1440
+ },
1441
+ {
1442
+ "cell_type": "markdown",
1443
+ "metadata": {},
1444
+ "source": [
1445
+ "Let's begin testing some queries."
1446
+ ]
1447
+ },
1448
+ {
1449
+ "cell_type": "code",
1450
+ "execution_count": 20,
1451
+ "metadata": {},
1452
+ "outputs": [
1453
+ {
1454
+ "data": {
1455
+ "text/html": [
1456
+ "\n",
1457
+ " <div style=\"display: flex; flex-flow: row wrap; text-align: center;\">\n",
1458
+ " \n",
1459
+ " <figure style=\"margin: 5px !important;\">\n",
1460
+ " <img src=\"https://38.media.tumblr.com/af53df8d946bbca23be97691db0ecd5e/tumblr_nq3l305zdF1s71nvbo1_400.gif\" style=\"width: 120px; height: 90px\" >\n",
1461
+ " </figure>\n",
1462
+ " \n",
1463
+ " <figure style=\"margin: 5px !important;\">\n",
1464
+ " <img src=\"https://33.media.tumblr.com/a574ab035e7edc7708db423ee67f3ac4/tumblr_nq1zodZJNx1uoke7ao1_400.gif\" style=\"width: 120px; height: 90px\" >\n",
1465
+ " </figure>\n",
1466
+ " \n",
1467
+ " <figure style=\"margin: 5px !important;\">\n",
1468
+ " <img src=\"https://38.media.tumblr.com/94703ea885174ffc97c44d57487d7ee9/tumblr_na6oo2PKSC1silsr6o1_400.gif\" style=\"width: 120px; height: 90px\" >\n",
1469
+ " </figure>\n",
1470
+ " \n",
1471
+ " <figure style=\"margin: 5px !important;\">\n",
1472
+ " <img src=\"https://38.media.tumblr.com/fa6a31e326066bb27776066150c8c810/tumblr_np38ipgJPd1tkkgpso1_250.gif\" style=\"width: 120px; height: 90px\" >\n",
1473
+ " </figure>\n",
1474
+ " \n",
1475
+ " <figure style=\"margin: 5px !important;\">\n",
1476
+ " <img src=\"https://38.media.tumblr.com/241d89939a5714c2db4566d9108245fe/tumblr_n9xv6aqQ5A1qmgppeo1_250.gif\" style=\"width: 120px; height: 90px\" >\n",
1477
+ " </figure>\n",
1478
+ " \n",
1479
+ " <figure style=\"margin: 5px !important;\">\n",
1480
+ " <img src=\"https://31.media.tumblr.com/a00ae69f826dbe89a5bdabad567ac88d/tumblr_n8x5e6ZcFW1sjpl9lo1_400.gif\" style=\"width: 120px; height: 90px\" >\n",
1481
+ " </figure>\n",
1482
+ " \n",
1483
+ " <figure style=\"margin: 5px !important;\">\n",
1484
+ " <img src=\"https://31.media.tumblr.com/28a9aac3c21941e1c61dd9ab4390c3f5/tumblr_nhdr3clKDa1sntw1mo1_400.gif\" style=\"width: 120px; height: 90px\" >\n",
1485
+ " </figure>\n",
1486
+ " \n",
1487
+ " <figure style=\"margin: 5px !important;\">\n",
1488
+ " <img src=\"https://31.media.tumblr.com/5cbd531e1d8cc7fefffdb8a68ec62b1d/tumblr_naysx8YTzn1tzl1owo1_400.gif\" style=\"width: 120px; height: 90px\" >\n",
1489
+ " </figure>\n",
1490
+ " \n",
1491
+ " <figure style=\"margin: 5px !important;\">\n",
1492
+ " <img src=\"https://38.media.tumblr.com/bc300fcbae8e4eb65c3901a246f46e4c/tumblr_niu5dzNP7G1u62tooo1_400.gif\" style=\"width: 120px; height: 90px\" >\n",
1493
+ " </figure>\n",
1494
+ " \n",
1495
+ " <figure style=\"margin: 5px !important;\">\n",
1496
+ " <img src=\"https://38.media.tumblr.com/1c3edb33951b52020b9271185942b2b2/tumblr_nflm4phy0P1u4txqeo1_250.gif\" style=\"width: 120px; height: 90px\" >\n",
1497
+ " </figure>\n",
1498
+ " \n",
1499
+ " </div>\n",
1500
+ " "
1501
+ ],
1502
+ "text/plain": [
1503
+ "<IPython.core.display.HTML object>"
1504
+ ]
1505
+ },
1506
+ "execution_count": 20,
1507
+ "metadata": {},
1508
+ "output_type": "execute_result"
1509
+ }
1510
+ ],
1511
+ "source": [
1512
+ "gifs = search_gif(\"a dog being confused\")\n",
1513
+ "display_gif(gifs)"
1514
+ ]
1515
+ },
1516
+ {
1517
+ "cell_type": "code",
1518
+ "execution_count": 21,
1519
+ "metadata": {},
1520
+ "outputs": [
1521
+ {
1522
+ "data": {
1523
+ "text/html": [
1524
+ "\n",
1525
+ " <div style=\"display: flex; flex-flow: row wrap; text-align: center;\">\n",
1526
+ " \n",
1527
+ " <figure style=\"margin: 5px !important;\">\n",
1528
+ " <img src=\"https://33.media.tumblr.com/73841eb3b37ad5277b324359a83bb19e/tumblr_ngnz25VQpD1twctp1o1_400.gif\" style=\"width: 120px; height: 90px\" >\n",
1529
+ " </figure>\n",
1530
+ " \n",
1531
+ " <figure style=\"margin: 5px !important;\">\n",
1532
+ " <img src=\"https://38.media.tumblr.com/7b8ebff7051b8a7d0502294465559861/tumblr_na8n60gCmT1tiamw8o1_500.gif\" style=\"width: 120px; height: 90px\" >\n",
1533
+ " </figure>\n",
1534
+ " \n",
1535
+ " <figure style=\"margin: 5px !important;\">\n",
1536
+ " <img src=\"https://38.media.tumblr.com/49223a5564c8d7dfafe115063ba88c8a/tumblr_nnrps82EvG1sxvevjo1_400.gif\" style=\"width: 120px; height: 90px\" >\n",
1537
+ " </figure>\n",
1538
+ " \n",
1539
+ " <figure style=\"margin: 5px !important;\">\n",
1540
+ " <img src=\"https://31.media.tumblr.com/aa9c98f92f06cc3484ae395194db6d7f/tumblr_naeyc6yQWy1tahfdeo1_250.gif\" style=\"width: 120px; height: 90px\" >\n",
1541
+ " </figure>\n",
1542
+ " \n",
1543
+ " <figure style=\"margin: 5px !important;\">\n",
1544
+ " <img src=\"https://38.media.tumblr.com/e7a1d7ed5f2289db13e1812a91c0eedf/tumblr_nf8r2nWajt1s236zjo1_400.gif\" style=\"width: 120px; height: 90px\" >\n",
1545
+ " </figure>\n",
1546
+ " \n",
1547
+ " <figure style=\"margin: 5px !important;\">\n",
1548
+ " <img src=\"https://38.media.tumblr.com/7d8f9cac33b4fc76908a37bf28ab6fca/tumblr_noswtqDMKC1tyncywo1_400.gif\" style=\"width: 120px; height: 90px\" >\n",
1549
+ " </figure>\n",
1550
+ " \n",
1551
+ " <figure style=\"margin: 5px !important;\">\n",
1552
+ " <img src=\"https://38.media.tumblr.com/136d1d103edf3a82c2332bf8ef28d6d3/tumblr_nhm8rleyTk1u333yco2_400.gif\" style=\"width: 120px; height: 90px\" >\n",
1553
+ " </figure>\n",
1554
+ " \n",
1555
+ " <figure style=\"margin: 5px !important;\">\n",
1556
+ " <img src=\"https://33.media.tumblr.com/0f97de4f3cc8dca408ca4ab036460412/tumblr_njmp6tj53K1thqmhto1_400.gif\" style=\"width: 120px; height: 90px\" >\n",
1557
+ " </figure>\n",
1558
+ " \n",
1559
+ " <figure style=\"margin: 5px !important;\">\n",
1560
+ " <img src=\"https://31.media.tumblr.com/be2b34de9ff751da15cbde3144d25007/tumblr_nh4oj86LJO1slj978o1_400.gif\" style=\"width: 120px; height: 90px\" >\n",
1561
+ " </figure>\n",
1562
+ " \n",
1563
+ " <figure style=\"margin: 5px !important;\">\n",
1564
+ " <img src=\"https://38.media.tumblr.com/5f45e9a56121b070ddceca58b37e9ace/tumblr_njaggwVmdn1un7vpco1_400.gif\" style=\"width: 120px; height: 90px\" >\n",
1565
+ " </figure>\n",
1566
+ " \n",
1567
+ " </div>\n",
1568
+ " "
1569
+ ],
1570
+ "text/plain": [
1571
+ "<IPython.core.display.HTML object>"
1572
+ ]
1573
+ },
1574
+ "execution_count": 21,
1575
+ "metadata": {},
1576
+ "output_type": "execute_result"
1577
+ }
1578
+ ],
1579
+ "source": [
1580
+ "gifs = search_gif(\"animals being cute\")\n",
1581
+ "display_gif(gifs)"
1582
+ ]
1583
+ },
1584
+ {
1585
+ "cell_type": "code",
1586
+ "execution_count": 22,
1587
+ "metadata": {},
1588
+ "outputs": [
1589
+ {
1590
+ "data": {
1591
+ "text/html": [
1592
+ "\n",
1593
+ " <div style=\"display: flex; flex-flow: row wrap; text-align: center;\">\n",
1594
+ " \n",
1595
+ " <figure style=\"margin: 5px !important;\">\n",
1596
+ " <img src=\"https://38.media.tumblr.com/f15089751a2f2ac54db50f2cc0659fb0/tumblr_naenf9skQT1rggk0bo1_400.gif\" style=\"width: 120px; height: 90px\" >\n",
1597
+ " </figure>\n",
1598
+ " \n",
1599
+ " <figure style=\"margin: 5px !important;\">\n",
1600
+ " <img src=\"https://38.media.tumblr.com/f9aaae6f9d2e2af17cb6deeb6ad15713/tumblr_n9lt3cvvy61trvs7go1_400.gif\" style=\"width: 120px; height: 90px\" >\n",
1601
+ " </figure>\n",
1602
+ " \n",
1603
+ " <figure style=\"margin: 5px !important;\">\n",
1604
+ " <img src=\"https://33.media.tumblr.com/61cf2e6af0e63d431cea07f1bf5c3fce/tumblr_nbm4g6rTLS1tl5a9oo1_400.gif\" style=\"width: 120px; height: 90px\" >\n",
1605
+ " </figure>\n",
1606
+ " \n",
1607
+ " <figure style=\"margin: 5px !important;\">\n",
1608
+ " <img src=\"https://38.media.tumblr.com/d44de105f1b276baac01eb1c7715b9a0/tumblr_nf24sk7MDS1qkibgzo1_500.gif\" style=\"width: 120px; height: 90px\" >\n",
1609
+ " </figure>\n",
1610
+ " \n",
1611
+ " <figure style=\"margin: 5px !important;\">\n",
1612
+ " <img src=\"https://33.media.tumblr.com/97b9db062d617a6b909d5642374f44b9/tumblr_n92iyqDqO91s1eq7yo1_250.gif\" style=\"width: 120px; height: 90px\" >\n",
1613
+ " </figure>\n",
1614
+ " \n",
1615
+ " <figure style=\"margin: 5px !important;\">\n",
1616
+ " <img src=\"https://31.media.tumblr.com/ee35386edeec1ff5f67faf89f2bb5a11/tumblr_nk1p5c17tS1ssdnnbo1_400.gif\" style=\"width: 120px; height: 90px\" >\n",
1617
+ " </figure>\n",
1618
+ " \n",
1619
+ " <figure style=\"margin: 5px !important;\">\n",
1620
+ " <img src=\"https://33.media.tumblr.com/0c0ae10c9fc60b1bf2a204f16afcbd81/tumblr_ncaftzIchN1tg281go1_400.gif\" style=\"width: 120px; height: 90px\" >\n",
1621
+ " </figure>\n",
1622
+ " \n",
1623
+ " <figure style=\"margin: 5px !important;\">\n",
1624
+ " <img src=\"https://33.media.tumblr.com/1ed5b68ec5933ede5b2f9200256237c5/tumblr_niua9uhldr1u93r6so1_400.gif\" style=\"width: 120px; height: 90px\" >\n",
1625
+ " </figure>\n",
1626
+ " \n",
1627
+ " <figure style=\"margin: 5px !important;\">\n",
1628
+ " <img src=\"https://38.media.tumblr.com/e5d4347eb533aa06fd20238d8b326b1c/tumblr_nff7ifHJnr1solzopo1_500.gif\" style=\"width: 120px; height: 90px\" >\n",
1629
+ " </figure>\n",
1630
+ " \n",
1631
+ " <figure style=\"margin: 5px !important;\">\n",
1632
+ " <img src=\"https://37.media.tumblr.com/6d87ff4dc55ab7d2d31f1b40997f64d7/tumblr_n952rocZdV1t1o9tno1_400.gif\" style=\"width: 120px; height: 90px\" >\n",
1633
+ " </figure>\n",
1634
+ " \n",
1635
+ " </div>\n",
1636
+ " "
1637
+ ],
1638
+ "text/plain": [
1639
+ "<IPython.core.display.HTML object>"
1640
+ ]
1641
+ },
1642
+ "execution_count": 22,
1643
+ "metadata": {},
1644
+ "output_type": "execute_result"
1645
+ }
1646
+ ],
1647
+ "source": [
1648
+ "gifs = search_gif(\"people being angry\")\n",
1649
+ "display_gif(gifs)"
1650
+ ]
1651
+ },
1652
+ {
1653
+ "cell_type": "code",
1654
+ "execution_count": 23,
1655
+ "metadata": {},
1656
+ "outputs": [
1657
+ {
1658
+ "data": {
1659
+ "text/html": [
1660
+ "\n",
1661
+ " <div style=\"display: flex; flex-flow: row wrap; text-align: center;\">\n",
1662
+ " \n",
1663
+ " <figure style=\"margin: 5px !important;\">\n",
1664
+ " <img src=\"https://31.media.tumblr.com/2a42598230ce78b14c1e010b1b8bc1bf/tumblr_newfs5f3b21thysnvo1_250.gif\" style=\"width: 120px; height: 90px\" >\n",
1665
+ " </figure>\n",
1666
+ " \n",
1667
+ " <figure style=\"margin: 5px !important;\">\n",
1668
+ " <img src=\"https://33.media.tumblr.com/b7d88c296436ce74217547853ecedfaf/tumblr_nqgr3t03ae1rk9zcyo1_1280.gif\" style=\"width: 120px; height: 90px\" >\n",
1669
+ " </figure>\n",
1670
+ " \n",
1671
+ " <figure style=\"margin: 5px !important;\">\n",
1672
+ " <img src=\"https://33.media.tumblr.com/3a567e5793b3c13efb7f7c4e4b359610/tumblr_nog7wyzRDQ1ty5qoho1_250.gif\" style=\"width: 120px; height: 90px\" >\n",
1673
+ " </figure>\n",
1674
+ " \n",
1675
+ " <figure style=\"margin: 5px !important;\">\n",
1676
+ " <img src=\"https://38.media.tumblr.com/66a2a7acc8ce515d58992bc0788b4540/tumblr_nee16ggWbQ1taytvao1_400.gif\" style=\"width: 120px; height: 90px\" >\n",
1677
+ " </figure>\n",
1678
+ " \n",
1679
+ " <figure style=\"margin: 5px !important;\">\n",
1680
+ " <img src=\"https://31.media.tumblr.com/88574dc6cf526445652dd569e5061b63/tumblr_n9izv8bCK31titpn6o1_250.gif\" style=\"width: 120px; height: 90px\" >\n",
1681
+ " </figure>\n",
1682
+ " \n",
1683
+ " <figure style=\"margin: 5px !important;\">\n",
1684
+ " <img src=\"https://38.media.tumblr.com/53d81b25b6b2e339084f194aa273babb/tumblr_mtxi16Fm3z1qzcey8o1_400.gif\" style=\"width: 120px; height: 90px\" >\n",
1685
+ " </figure>\n",
1686
+ " \n",
1687
+ " <figure style=\"margin: 5px !important;\">\n",
1688
+ " <img src=\"https://33.media.tumblr.com/29eb4fdc45054e612b3b255fcc039cde/tumblr_nhki44Aivl1u7qbdlo1_250.gif\" style=\"width: 120px; height: 90px\" >\n",
1689
+ " </figure>\n",
1690
+ " \n",
1691
+ " <figure style=\"margin: 5px !important;\">\n",
1692
+ " <img src=\"https://38.media.tumblr.com/504c94419a234c58b5f303b90d3dfae7/tumblr_nqpxss53VY1uus8ugo1_250.gif\" style=\"width: 120px; height: 90px\" >\n",
1693
+ " </figure>\n",
1694
+ " \n",
1695
+ " <figure style=\"margin: 5px !important;\">\n",
1696
+ " <img src=\"https://38.media.tumblr.com/0aa084e3ec85f8b5f7936b576b431283/tumblr_no98qtaRMd1uvqm8go1_250.gif\" style=\"width: 120px; height: 90px\" >\n",
1697
+ " </figure>\n",
1698
+ " \n",
1699
+ " <figure style=\"margin: 5px !important;\">\n",
1700
+ " <img src=\"https://38.media.tumblr.com/80d5c2a1fd1fa51d2daea2cc4c8cbed6/tumblr_nerwreUW0p1tjon47o1_400.gif\" style=\"width: 120px; height: 90px\" >\n",
1701
+ " </figure>\n",
1702
+ " \n",
1703
+ " </div>\n",
1704
+ " "
1705
+ ],
1706
+ "text/plain": [
1707
+ "<IPython.core.display.HTML object>"
1708
+ ]
1709
+ },
1710
+ "execution_count": 23,
1711
+ "metadata": {},
1712
+ "output_type": "execute_result"
1713
+ }
1714
+ ],
1715
+ "source": [
1716
+ "gifs = search_gif(\"a man dancing\")\n",
1717
+ "display_gif(gifs)"
1718
+ ]
1719
+ },
1720
+ {
1721
+ "cell_type": "code",
1722
+ "execution_count": 24,
1723
+ "metadata": {
1724
+ "colab": {
1725
+ "base_uri": "https://localhost:8080/",
1726
+ "height": 122
1727
+ },
1728
+ "id": "DGHLvLLQBizb",
1729
+ "outputId": "e92aa493-d3a4-4f76-f1b8-41313acd8100"
1730
+ },
1731
+ "outputs": [
1732
+ {
1733
+ "data": {
1734
+ "text/html": [
1735
+ "\n",
1736
+ " <div style=\"display: flex; flex-flow: row wrap; text-align: center;\">\n",
1737
+ " \n",
1738
+ " <figure style=\"margin: 5px !important;\">\n",
1739
+ " <img src=\"https://33.media.tumblr.com/b27107ead2c85c7adcdccd1581d5fcdb/tumblr_no8ub0S6pC1r09o0mo1_250.gif\" style=\"width: 120px; height: 90px\" >\n",
1740
+ " </figure>\n",
1741
+ " \n",
1742
+ " <figure style=\"margin: 5px !important;\">\n",
1743
+ " <img src=\"https://33.media.tumblr.com/526aa008711e8a10b42f35a7f01723f1/tumblr_nfutjdMpTZ1r0kw6ko1_250.gif\" style=\"width: 120px; height: 90px\" >\n",
1744
+ " </figure>\n",
1745
+ " \n",
1746
+ " <figure style=\"margin: 5px !important;\">\n",
1747
+ " <img src=\"https://33.media.tumblr.com/3c4a71fcf5ab0c858253d26e433e51ea/tumblr_nbdszk9TUX1rik13to1_500.gif\" style=\"width: 120px; height: 90px\" >\n",
1748
+ " </figure>\n",
1749
+ " \n",
1750
+ " <figure style=\"margin: 5px !important;\">\n",
1751
+ " <img src=\"https://38.media.tumblr.com/697655681e81e048324104cee4ce91ae/tumblr_na83gnZpD71sohc0oo1_250.gif\" style=\"width: 120px; height: 90px\" >\n",
1752
+ " </figure>\n",
1753
+ " \n",
1754
+ " <figure style=\"margin: 5px !important;\">\n",
1755
+ " <img src=\"https://33.media.tumblr.com/c45ddcedbd27c05e9b161ee728724b9e/tumblr_nowe9hyCN41s0uaixo1_400.gif\" style=\"width: 120px; height: 90px\" >\n",
1756
+ " </figure>\n",
1757
+ " \n",
1758
+ " <figure style=\"margin: 5px !important;\">\n",
1759
+ " <img src=\"https://38.media.tumblr.com/9de2be73fb5b767092b5738fda438ed0/tumblr_noctf0galn1rqr3rbo1_500.gif\" style=\"width: 120px; height: 90px\" >\n",
1760
+ " </figure>\n",
1761
+ " \n",
1762
+ " <figure style=\"margin: 5px !important;\">\n",
1763
+ " <img src=\"https://38.media.tumblr.com/74cff22186ed3c906a6694d5820eec82/tumblr_nh2hjhz4Qs1tl5e30o1_400.gif\" style=\"width: 120px; height: 90px\" >\n",
1764
+ " </figure>\n",
1765
+ " \n",
1766
+ " <figure style=\"margin: 5px !important;\">\n",
1767
+ " <img src=\"https://33.media.tumblr.com/13cff9f144a16121e8fb2506e0a9ae99/tumblr_n4p5c5LSZs1sqxmy5o1_500.gif\" style=\"width: 120px; height: 90px\" >\n",
1768
+ " </figure>\n",
1769
+ " \n",
1770
+ " <figure style=\"margin: 5px !important;\">\n",
1771
+ " <img src=\"https://38.media.tumblr.com/8f816d5f3e0ce4a29a3a52b183606eda/tumblr_n9bq3uQdab1tpa2reo1_250.gif\" style=\"width: 120px; height: 90px\" >\n",
1772
+ " </figure>\n",
1773
+ " \n",
1774
+ " <figure style=\"margin: 5px !important;\">\n",
1775
+ " <img src=\"https://38.media.tumblr.com/6a1fb846930ded5339e93f3d93ed3d9e/tumblr_nasq1kuyzk1qew089o1_400.gif\" style=\"width: 120px; height: 90px\" >\n",
1776
+ " </figure>\n",
1777
+ " \n",
1778
+ " </div>\n",
1779
+ " "
1780
+ ],
1781
+ "text/plain": [
1782
+ "<IPython.core.display.HTML object>"
1783
+ ]
1784
+ },
1785
+ "execution_count": 24,
1786
+ "metadata": {},
1787
+ "output_type": "execute_result"
1788
+ }
1789
+ ],
1790
+ "source": [
1791
+ "gifs = search_gif(\"a woman dancing\")\n",
1792
+ "display_gif(gifs)"
1793
+ ]
1794
+ },
1795
+ {
1796
+ "cell_type": "code",
1797
+ "execution_count": 25,
1798
+ "metadata": {
1799
+ "colab": {
1800
+ "base_uri": "https://localhost:8080/",
1801
+ "height": 122
1802
+ },
1803
+ "id": "4VTSoj8xL_u_",
1804
+ "outputId": "1c942c5e-7a21-4789-8ff4-ccbb3ffd0eb7"
1805
+ },
1806
+ "outputs": [
1807
+ {
1808
+ "data": {
1809
+ "text/html": [
1810
+ "\n",
1811
+ " <div style=\"display: flex; flex-flow: row wrap; text-align: center;\">\n",
1812
+ " \n",
1813
+ " <figure style=\"margin: 5px !important;\">\n",
1814
+ " <img src=\"https://33.media.tumblr.com/7ada83ae354be1d83ea4407fea789ab8/tumblr_na0e6razjV1s71nvbo1_250.gif\" style=\"width: 120px; height: 90px\" >\n",
1815
+ " </figure>\n",
1816
+ " \n",
1817
+ " <figure style=\"margin: 5px !important;\">\n",
1818
+ " <img src=\"https://33.media.tumblr.com/3be31f4531ed041ff9b80465b56d810e/tumblr_nr0dycLuRO1useffdo1_400.gif\" style=\"width: 120px; height: 90px\" >\n",
1819
+ " </figure>\n",
1820
+ " \n",
1821
+ " <figure style=\"margin: 5px !important;\">\n",
1822
+ " <img src=\"https://33.media.tumblr.com/f0edc38b8dacce783bebcdf41db55a93/tumblr_npapb2c4Wz1uolkubo1_250.gif\" style=\"width: 120px; height: 90px\" >\n",
1823
+ " </figure>\n",
1824
+ " \n",
1825
+ " <figure style=\"margin: 5px !important;\">\n",
1826
+ " <img src=\"https://33.media.tumblr.com/2bf0f300d9ecfbcedf2dd3ba2b40b5e5/tumblr_ne5p2oTuCj1tdmffyo1_250.gif\" style=\"width: 120px; height: 90px\" >\n",
1827
+ " </figure>\n",
1828
+ " \n",
1829
+ " <figure style=\"margin: 5px !important;\">\n",
1830
+ " <img src=\"https://38.media.tumblr.com/3e1f37fea789bb1508d40e8c30f791ae/tumblr_na3xfcUdnK1tiamx1o1_400.gif\" style=\"width: 120px; height: 90px\" >\n",
1831
+ " </figure>\n",
1832
+ " \n",
1833
+ " <figure style=\"margin: 5px !important;\">\n",
1834
+ " <img src=\"https://38.media.tumblr.com/0b04187cb51a8889b0f41e5fbe390df2/tumblr_nbcltiDvcq1s7ri4yo1_400.gif\" style=\"width: 120px; height: 90px\" >\n",
1835
+ " </figure>\n",
1836
+ " \n",
1837
+ " <figure style=\"margin: 5px !important;\">\n",
1838
+ " <img src=\"https://38.media.tumblr.com/e52a1d77dc0a679840a715c02035e5da/tumblr_nfbeo6Qqr91tl8fnfo1_400.gif\" style=\"width: 120px; height: 90px\" >\n",
1839
+ " </figure>\n",
1840
+ " \n",
1841
+ " <figure style=\"margin: 5px !important;\">\n",
1842
+ " <img src=\"https://38.media.tumblr.com/a67f2f007b9881080aa3fe3584847bc5/tumblr_nc1wzyMaJP1tzj4j8o1_250.gif\" style=\"width: 120px; height: 90px\" >\n",
1843
+ " </figure>\n",
1844
+ " \n",
1845
+ " <figure style=\"margin: 5px !important;\">\n",
1846
+ " <img src=\"https://38.media.tumblr.com/61e9abf3681eeacea18dae288f084d62/tumblr_nbw9gwXM0e1tk2ngvo1_400.gif\" style=\"width: 120px; height: 90px\" >\n",
1847
+ " </figure>\n",
1848
+ " \n",
1849
+ " <figure style=\"margin: 5px !important;\">\n",
1850
+ " <img src=\"https://31.media.tumblr.com/78355496a2ed41f0aa9fe855f9460bc3/tumblr_nais3s7sWa1s3att3o1_400.gif\" style=\"width: 120px; height: 90px\" >\n",
1851
+ " </figure>\n",
1852
+ " \n",
1853
+ " </div>\n",
1854
+ " "
1855
+ ],
1856
+ "text/plain": [
1857
+ "<IPython.core.display.HTML object>"
1858
+ ]
1859
+ },
1860
+ "execution_count": 25,
1861
+ "metadata": {},
1862
+ "output_type": "execute_result"
1863
+ }
1864
+ ],
1865
+ "source": [
1866
+ "gifs = search_gif(\"an animal dancing\")\n",
1867
+ "display_gif(gifs)"
1868
+ ]
1869
+ },
1870
+ {
1871
+ "cell_type": "markdown",
1872
+ "metadata": {},
1873
+ "source": [
1874
+ "Let's describe the third GIF with the ginger dog dancing on his hind legs."
1875
+ ]
1876
+ },
1877
+ {
1878
+ "cell_type": "code",
1879
+ "execution_count": 26,
1880
+ "metadata": {},
1881
+ "outputs": [
1882
+ {
1883
+ "data": {
1884
+ "text/html": [
1885
+ "\n",
1886
+ " <div style=\"display: flex; flex-flow: row wrap; text-align: center;\">\n",
1887
+ " \n",
1888
+ " <figure style=\"margin: 5px !important;\">\n",
1889
+ " <img src=\"https://38.media.tumblr.com/a67f2f007b9881080aa3fe3584847bc5/tumblr_nc1wzyMaJP1tzj4j8o1_250.gif\" style=\"width: 120px; height: 90px\" >\n",
1890
+ " </figure>\n",
1891
+ " \n",
1892
+ " <figure style=\"margin: 5px !important;\">\n",
1893
+ " <img src=\"https://33.media.tumblr.com/2bf0f300d9ecfbcedf2dd3ba2b40b5e5/tumblr_ne5p2oTuCj1tdmffyo1_250.gif\" style=\"width: 120px; height: 90px\" >\n",
1894
+ " </figure>\n",
1895
+ " \n",
1896
+ " <figure style=\"margin: 5px !important;\">\n",
1897
+ " <img src=\"https://33.media.tumblr.com/ec768e8a6f881fbc0f329932c8591a88/tumblr_mpqwb14Fsq1rjcfxro1_250.gif\" style=\"width: 120px; height: 90px\" >\n",
1898
+ " </figure>\n",
1899
+ " \n",
1900
+ " <figure style=\"margin: 5px !important;\">\n",
1901
+ " <img src=\"https://33.media.tumblr.com/7ada83ae354be1d83ea4407fea789ab8/tumblr_na0e6razjV1s71nvbo1_250.gif\" style=\"width: 120px; height: 90px\" >\n",
1902
+ " </figure>\n",
1903
+ " \n",
1904
+ " <figure style=\"margin: 5px !important;\">\n",
1905
+ " <img src=\"https://38.media.tumblr.com/f8b6d3d79b59462019c2daf2ba8b4148/tumblr_np762bxBYV1t7jda2o1_400.gif\" style=\"width: 120px; height: 90px\" >\n",
1906
+ " </figure>\n",
1907
+ " \n",
1908
+ " <figure style=\"margin: 5px !important;\">\n",
1909
+ " <img src=\"https://38.media.tumblr.com/a5ae79c2d62c592d7565684a72af8f2c/tumblr_nageslBNqC1tstoffo1_500.gif\" style=\"width: 120px; height: 90px\" >\n",
1910
+ " </figure>\n",
1911
+ " \n",
1912
+ " <figure style=\"margin: 5px !important;\">\n",
1913
+ " <img src=\"https://38.media.tumblr.com/3e1f37fea789bb1508d40e8c30f791ae/tumblr_na3xfcUdnK1tiamx1o1_400.gif\" style=\"width: 120px; height: 90px\" >\n",
1914
+ " </figure>\n",
1915
+ " \n",
1916
+ " <figure style=\"margin: 5px !important;\">\n",
1917
+ " <img src=\"https://33.media.tumblr.com/aec9cbbdf826f98307e6d5f3d544a4c2/tumblr_mmlrbhGDAO1qaqutao1_500.gif\" style=\"width: 120px; height: 90px\" >\n",
1918
+ " </figure>\n",
1919
+ " \n",
1920
+ " <figure style=\"margin: 5px !important;\">\n",
1921
+ " <img src=\"https://38.media.tumblr.com/0b04187cb51a8889b0f41e5fbe390df2/tumblr_nbcltiDvcq1s7ri4yo1_400.gif\" style=\"width: 120px; height: 90px\" >\n",
1922
+ " </figure>\n",
1923
+ " \n",
1924
+ " <figure style=\"margin: 5px !important;\">\n",
1925
+ " <img src=\"https://33.media.tumblr.com/14f9b213a7355096c14b0af3a7768f5d/tumblr_npexfuFU2K1ti77bgo1_400.gif\" style=\"width: 120px; height: 90px\" >\n",
1926
+ " </figure>\n",
1927
+ " \n",
1928
+ " </div>\n",
1929
+ " "
1930
+ ],
1931
+ "text/plain": [
1932
+ "<IPython.core.display.HTML object>"
1933
+ ]
1934
+ },
1935
+ "execution_count": 26,
1936
+ "metadata": {},
1937
+ "output_type": "execute_result"
1938
+ }
1939
+ ],
1940
+ "source": [
1941
+ "gifs = search_gif(\"a fluffy dog being cute and dancing like a person\")\n",
1942
+ "display_gif(gifs)"
1943
+ ]
1944
+ },
1945
+ {
1946
+ "cell_type": "markdown",
1947
+ "metadata": {},
1948
+ "source": [
1949
+ "These look like pretty good results.\n",
1950
+ "\n",
1951
+ "---"
1952
+ ]
1953
+ }
1954
+ ],
1955
+ "metadata": {
1956
+ "accelerator": "GPU",
1957
+ "colab": {
1958
+ "collapsed_sections": [],
1959
+ "name": "gif_search.ipynb",
1960
+ "provenance": []
1961
+ },
1962
+ "environment": {
1963
+ "kernel": "python3",
1964
+ "name": "common-cu110.m91",
1965
+ "type": "gcloud",
1966
+ "uri": "gcr.io/deeplearning-platform-release/base-cu110:m91"
1967
+ },
1968
+ "interpreter": {
1969
+ "hash": "b8e7999f96e1b425e2d542f21b571f5a4be3e97158b0b46ea1b2500df63956ce"
1970
+ },
1971
+ "kernelspec": {
1972
+ "display_name": "Python 3",
1973
+ "language": "python",
1974
+ "name": "python3"
1975
+ },
1976
+ "language_info": {
1977
+ "codemirror_mode": {
1978
+ "name": "ipython",
1979
+ "version": 3
1980
+ },
1981
+ "file_extension": ".py",
1982
+ "mimetype": "text/x-python",
1983
+ "name": "python",
1984
+ "nbconvert_exporter": "python",
1985
+ "pygments_lexer": "ipython3",
1986
+ "version": "3.7.12"
1987
+ },
1988
+ "widgets": {}
1989
+ },
1990
+ "nbformat": 4,
1991
+ "nbformat_minor": 4
1992
+ }
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ transformers
2
+ sentence-transformers
3
+ pinecone-client
4
+ click==8.0