jamescalam committed on
Commit
90ae924
1 Parent(s): d2b37ff

first version

Browse files
Files changed (3) hide show
  1. app.py +52 -0
  2. gif-search-indexer.ipynb +1992 -0
  3. requirements.txt +4 -0
app.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pinecone
3
+ from sentence_transformers import SentenceTransformer
4
+
5
+ PINECONE_KEY = st.secrets["PINECONE_KEY"] # app.pinecone.io
6
+
7
@st.experimental_singleton(show_spinner=False)
def init_pinecone():
    """Connect to Pinecone and return a handle to the 'gif-search' index.

    Decorated with Streamlit's singleton cache (built-in spinner disabled),
    so the returned index object is meant to be reused rather than rebuilt
    each time the script runs.

    Returns:
        The ``pinecone.Index`` object for the 'gif-search' index.
    """
    # Authenticate using the API key read from Streamlit secrets.
    pinecone.init(api_key=PINECONE_KEY, environment="us-west1-gcp")
    gif_index = pinecone.Index('gif-search')
    return gif_index
11
+
12
@st.experimental_singleton(show_spinner=False)
def init_retriever():
    """Load the sentence-embedding model used to encode search queries.

    Decorated with Streamlit's singleton cache (built-in spinner disabled),
    so the loaded model is meant to be reused rather than reloaded each time
    the script runs.

    Returns:
        A ``SentenceTransformer`` instance for all-MiniLM-L6-v2.
    """
    model_name = 'sentence-transformers/all-MiniLM-L6-v2'
    encoder = SentenceTransformer(model_name)
    return encoder
15
+
16
+ with st.spinner("Retrieving the Retriever..."):
17
+ retriever = init_retriever()
18
+
19
+ with st.spinner("Connecting to Pinecone..."):
20
+ index = init_pinecone()
21
+
22
def card(urls):
    """Render a centred, wrapping row of GIF thumbnails on the Streamlit page.

    Args:
        urls: iterable of image URLs; each becomes one 130x100 px ``<img>``
            wrapped in a ``<figure>``.

    Returns:
        Whatever ``st.markdown`` returns for the rendered HTML block.
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    # The URLs are interpolated into raw HTML that is rendered with
    # unsafe_allow_html=True, so escape them (quotes included) to stop a
    # crafted value from breaking out of the src attribute.
    # NOTE(review): the stray "!important;" in the figure style is not a valid
    # CSS declaration; it is kept as-is because the intended target is unclear.
    figures = [
        '\n'
        '<figure style="margin-top: 5px; margin-bottom: 5px; !important;">\n'
        f'<img src="{escape(str(url), quote=True)}" '
        'style="width: 130px; height: 100px; padding-left: 5px; padding-right: 5px" >\n'
        '</figure>\n'
        for url in urls
    ]
    gallery_html = (
        '\n'
        '<div style="display: flex; flex-flow: row wrap; text-align: center; justify-content: center;">\n'
        f"{''.join(figures)}\n"
        '</div>\n'
    )
    return st.markdown(gallery_html, unsafe_allow_html=True)
33
+
34
+ st.write("""
35
+ ## ⚡️ AI-Powered GIF Search ⚡️
36
+
37
+ Search for GIFs using Semantic Search, learn how it works [here](https://www.pinecone.io/learn/gif-search/).
38
+ """)
39
+
40
# Free-text search box; an empty string means nothing has been typed yet.
query = st.text_input("What are you looking for?", "")

# Skip the search for empty or whitespace-only input so a blank query does
# not trigger a model encode and a Pinecone round trip.
if query.strip() != "":
    with st.spinner(text="Similarity Searching..."):
        # encode() is given a one-item batch; tolist() converts the result
        # to plain Python lists before it is sent to the index.
        xq = retriever.encode([query]).tolist()
        # Fetch the 30 nearest matches together with their stored metadata.
        xc = index.query(xq, top_k=30, include_metadata=True)

    # Each match carries the GIF URL in its metadata.
    urls = [match['metadata']['url'] for match in xc['matches']]

    with st.spinner(text="Fetching GIFs 🚀🚀🚀"):
        card(urls)
gif-search-indexer.ipynb ADDED
@@ -0,0 +1,1992 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# NLP Powered GIF search"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "markdown",
12
+ "metadata": {},
13
+ "source": [
14
+ "We will use the [Tumblr GIF Description Dataset](http://raingo.github.io/TGIF-Release/), which contains over 100K animated GIFs and 120K sentences describing their visual content. Using this data with a *vector database* and a *retriever*, we can create an NLP-powered GIF search tool.\n",
15
+ "\n",
16
+ "There are a few packages that must be installed for this notebook to run:"
17
+ ]
18
+ },
19
+ {
20
+ "cell_type": "code",
21
+ "execution_count": null,
22
+ "metadata": {},
23
+ "outputs": [],
24
+ "source": [
25
+ "pip install -U pandas pinecone-client sentence-transformers tqdm"
26
+ ]
27
+ },
28
+ {
29
+ "cell_type": "markdown",
30
+ "metadata": {},
31
+ "source": [
32
+ "We must also set the following notebook parameters to display the GIF images we will be working with."
33
+ ]
34
+ },
35
+ {
36
+ "cell_type": "code",
37
+ "execution_count": 1,
38
+ "metadata": {
39
+ "id": "r8KN-iWWdwby"
40
+ },
41
+ "outputs": [],
42
+ "source": [
43
+ "from IPython.display import HTML\n",
44
+ "from IPython.core.interactiveshell import InteractiveShell\n",
45
+ "InteractiveShell.ast_node_interactivity = \"all\""
46
+ ]
47
+ },
48
+ {
49
+ "cell_type": "markdown",
50
+ "metadata": {
51
+ "id": "KFIZrga-6Jq_"
52
+ },
53
+ "source": [
54
+ "## Download and Extract Dataset"
55
+ ]
56
+ },
57
+ {
58
+ "cell_type": "markdown",
59
+ "metadata": {},
60
+ "source": [
61
+ "First let's download and extract the dataset. The dataset is available [here](https://github.com/raingo/TGIF-Release) on GitHub. We can use the link below to download the dataset directly. We can also access the link from a browser to directly download the files."
62
+ ]
63
+ },
64
+ {
65
+ "cell_type": "code",
66
+ "execution_count": 2,
67
+ "metadata": {
68
+ "colab": {
69
+ "base_uri": "https://localhost:8080/"
70
+ },
71
+ "id": "ZD4gusO9YB1-",
72
+ "outputId": "2d69fa61-f67a-45c8-ecc4-4d1c9b06f7cf"
73
+ },
74
+ "outputs": [
75
+ {
76
+ "name": "stdout",
77
+ "output_type": "stream",
78
+ "text": [
79
+ "--2022-08-13 16:40:35-- https://github.com/raingo/TGIF-Release/archive/master.zip\n",
80
+ "Resolving github.com (github.com)... 140.82.114.4\n",
81
+ "Connecting to github.com (github.com)|140.82.114.4|:443... connected.\n",
82
+ "HTTP request sent, awaiting response... 302 Found\n",
83
+ "Location: https://codeload.github.com/raingo/TGIF-Release/zip/refs/heads/master [following]\n",
84
+ "--2022-08-13 16:40:35-- https://codeload.github.com/raingo/TGIF-Release/zip/refs/heads/master\n",
85
+ "Resolving codeload.github.com (codeload.github.com)... 140.82.114.10\n",
86
+ "Connecting to codeload.github.com (codeload.github.com)|140.82.114.10|:443... connected.\n",
87
+ "HTTP request sent, awaiting response... 200 OK\n",
88
+ "Length: unspecified [application/zip]\n",
89
+ "Saving to: ‘master.zip’\n",
90
+ "\n",
91
+ "master.zip [ <=> ] 11.82M 6.59MB/s in 1.8s \n",
92
+ "\n",
93
+ "2022-08-13 16:40:37 (6.59 MB/s) - ‘master.zip’ saved [12396861]\n",
94
+ "\n"
95
+ ]
96
+ }
97
+ ],
98
+ "source": [
99
+ "# Use wget to download the master.zip file which contains the dataset\n",
100
+ "!wget https://github.com/raingo/TGIF-Release/archive/master.zip"
101
+ ]
102
+ },
103
+ {
104
+ "cell_type": "code",
105
+ "execution_count": 3,
106
+ "metadata": {
107
+ "id": "qLvXp0RtYTTz"
108
+ },
109
+ "outputs": [
110
+ {
111
+ "name": "stdout",
112
+ "output_type": "stream",
113
+ "text": [
114
+ "Archive: master.zip\n",
115
+ "3e54d2f71418d8a2e9f5f61aa5be0edb9c0ac2b8\n",
116
+ " creating: TGIF-Release-master/\n",
117
+ " inflating: TGIF-Release-master/.gitignore \n",
118
+ " inflating: TGIF-Release-master/.gitmodules \n",
119
+ " inflating: TGIF-Release-master/LICENSE \n",
120
+ " inflating: TGIF-Release-master/README.md \n",
121
+ " creating: TGIF-Release-master/code/\n",
122
+ " inflating: TGIF-Release-master/code/README.md \n",
123
+ " creating: TGIF-Release-master/code/crowdflower/\n",
124
+ " extracting: TGIF-Release-master/code/crowdflower/.gitignore \n",
125
+ " inflating: TGIF-Release-master/code/crowdflower/.gitmodules \n",
126
+ " inflating: TGIF-Release-master/code/crowdflower/README.md \n",
127
+ " creating: TGIF-Release-master/code/crowdflower/back-end/\n",
128
+ " inflating: TGIF-Release-master/code/crowdflower/back-end/.gitignore \n",
129
+ " inflating: TGIF-Release-master/code/crowdflower/back-end/deploy.sh \n",
130
+ " inflating: TGIF-Release-master/code/crowdflower/back-end/entity_extract.py \n",
131
+ " inflating: TGIF-Release-master/code/crowdflower/back-end/eval.py \n",
132
+ " creating: TGIF-Release-master/code/crowdflower/back-end/logs/\n",
133
+ " extracting: TGIF-Release-master/code/crowdflower/back-end/logs/.gitignore \n",
134
+ " inflating: TGIF-Release-master/code/crowdflower/back-end/logs/logging.conf \n",
135
+ " inflating: TGIF-Release-master/code/crowdflower/back-end/requirements.txt \n",
136
+ " inflating: TGIF-Release-master/code/crowdflower/back-end/routes.py \n",
137
+ " inflating: TGIF-Release-master/code/crowdflower/back-end/start_nlp_sever.sh \n",
138
+ " inflating: TGIF-Release-master/code/crowdflower/back-end/swear-words \n",
139
+ " inflating: TGIF-Release-master/code/crowdflower/back-end/test-data.sorted \n",
140
+ " creating: TGIF-Release-master/code/crowdflower/front-end/\n",
141
+ " extracting: TGIF-Release-master/code/crowdflower/front-end/.gitignore \n",
142
+ " creating: TGIF-Release-master/code/crowdflower/front-end/data/\n",
143
+ " extracting: TGIF-Release-master/code/crowdflower/front-end/data/.gitignore \n",
144
+ " inflating: TGIF-Release-master/code/crowdflower/front-end/data/gen_test_cases.py \n",
145
+ " inflating: TGIF-Release-master/code/crowdflower/front-end/data/notify.py \n",
146
+ " inflating: TGIF-Release-master/code/crowdflower/front-end/data/parse-res.py \n",
147
+ " inflating: TGIF-Release-master/code/crowdflower/front-end/data/pipeline.py \n",
148
+ " inflating: TGIF-Release-master/code/crowdflower/front-end/data/review-judgments.py \n",
149
+ " inflating: TGIF-Release-master/code/crowdflower/front-end/data/review-rest.py \n",
150
+ " inflating: TGIF-Release-master/code/crowdflower/front-end/data/review-test.py \n",
151
+ " inflating: TGIF-Release-master/code/crowdflower/front-end/data/set-diff.py \n",
152
+ " inflating: TGIF-Release-master/code/crowdflower/front-end/data/shuffle-test.py \n",
153
+ " inflating: TGIF-Release-master/code/crowdflower/front-end/data/update-test.py \n",
154
+ " creating: TGIF-Release-master/code/crowdflower/front-end/layout/\n",
155
+ " extracting: TGIF-Release-master/code/crowdflower/front-end/layout/.gitignore \n",
156
+ " inflating: TGIF-Release-master/code/crowdflower/front-end/layout/forgive.js \n",
157
+ " inflating: TGIF-Release-master/code/crowdflower/front-end/layout/instructions.md \n",
158
+ " inflating: TGIF-Release-master/code/crowdflower/front-end/layout/main.html \n",
159
+ " inflating: TGIF-Release-master/code/crowdflower/front-end/layout/main.js \n",
160
+ " inflating: TGIF-Release-master/code/crowdflower/front-end/layout/view.css \n",
161
+ " creating: TGIF-Release-master/code/crowdflower/table3-rating/\n",
162
+ " extracting: TGIF-Release-master/code/crowdflower/table3-rating/.gitignore \n",
163
+ " inflating: TGIF-Release-master/code/crowdflower/table3-rating/requirements.txt \n",
164
+ " inflating: TGIF-Release-master/code/crowdflower/table3-rating/routes.py \n",
165
+ " creating: TGIF-Release-master/code/crowdflower/table3-rating/static/\n",
166
+ " inflating: TGIF-Release-master/code/crowdflower/table3-rating/static/view.css \n",
167
+ " creating: TGIF-Release-master/code/crowdflower/table3-rating/templates/\n",
168
+ " inflating: TGIF-Release-master/code/crowdflower/table3-rating/templates/_formhelper.html \n",
169
+ " inflating: TGIF-Release-master/code/crowdflower/table3-rating/templates/submit.html \n",
170
+ " creating: TGIF-Release-master/code/gif2txt-lstm/\n",
171
+ " inflating: TGIF-Release-master/code/gif2txt-lstm/README.md \n",
172
+ " inflating: TGIF-Release-master/code/gif2txt-lstm/caffe-rnn.patch \n",
173
+ " creating: TGIF-Release-master/code/gif2txt-lstm/models/\n",
174
+ " inflating: TGIF-Release-master/code/gif2txt-lstm/models/README.md \n",
175
+ " creating: TGIF-Release-master/code/gifs-filter/\n",
176
+ " extracting: TGIF-Release-master/code/gifs-filter/.gitignore \n",
177
+ " inflating: TGIF-Release-master/code/gifs-filter/.gitmodules \n",
178
+ " inflating: TGIF-Release-master/code/gifs-filter/README.md \n",
179
+ " creating: TGIF-Release-master/code/gifs-filter/adult-filter/\n",
180
+ " inflating: TGIF-Release-master/code/gifs-filter/adult-filter/README.md \n",
181
+ " inflating: TGIF-Release-master/code/gifs-filter/adult-filter/filter.sh \n",
182
+ " inflating: TGIF-Release-master/code/gifs-filter/adult-filter/gen-raw.sh \n",
183
+ " creating: TGIF-Release-master/code/gifs-filter/adult-filter/keywords/\n",
184
+ " inflating: TGIF-Release-master/code/gifs-filter/adult-filter/keywords/README.md \n",
185
+ " inflating: TGIF-Release-master/code/gifs-filter/adult-filter/parse-tsv.py \n",
186
+ " creating: TGIF-Release-master/code/gifs-filter/c3d-models/\n",
187
+ " inflating: TGIF-Release-master/code/gifs-filter/c3d-models/README.md \n",
188
+ " inflating: TGIF-Release-master/code/gifs-filter/c3d-models/cluster-by-tags.py \n",
189
+ " inflating: TGIF-Release-master/code/gifs-filter/c3d-models/dump-tags.py \n",
190
+ " inflating: TGIF-Release-master/code/gifs-filter/c3d-models/filter-images.py \n",
191
+ " inflating: TGIF-Release-master/code/gifs-filter/c3d-models/filter-text.py \n",
192
+ " inflating: TGIF-Release-master/code/gifs-filter/c3d-models/filter_tags.py \n",
193
+ " creating: TGIF-Release-master/code/gifs-filter/c3d-models/giftypes/\n",
194
+ " inflating: TGIF-Release-master/code/gifs-filter/c3d-models/giftypes/c3d-models-rfc.pkl \n",
195
+ " creating: TGIF-Release-master/code/gifs-filter/c3d-models/no-motion/\n",
196
+ " inflating: TGIF-Release-master/code/gifs-filter/c3d-models/no-motion/c3d-models-rfc.pkl \n",
197
+ " inflating: TGIF-Release-master/code/gifs-filter/c3d-models/predict.py \n",
198
+ " inflating: TGIF-Release-master/code/gifs-filter/c3d-models/rank_tags.py \n",
199
+ " inflating: TGIF-Release-master/code/gifs-filter/c3d-models/setdiff.py \n",
200
+ " inflating: TGIF-Release-master/code/gifs-filter/c3d-models/setinter.py \n",
201
+ " inflating: TGIF-Release-master/code/gifs-filter/c3d-models/tag_rules \n",
202
+ " inflating: TGIF-Release-master/code/gifs-filter/c3d-models/train.py \n",
203
+ " inflating: TGIF-Release-master/code/gifs-filter/c3d.patch \n",
204
+ " creating: TGIF-Release-master/code/gifs-filter/c3d/\n",
205
+ " extracting: TGIF-Release-master/code/gifs-filter/c3d/.gitignore \n",
206
+ " inflating: TGIF-Release-master/code/gifs-filter/c3d/README.md \n",
207
+ " inflating: TGIF-Release-master/code/gifs-filter/c3d/agg_feat.py \n",
208
+ " inflating: TGIF-Release-master/code/gifs-filter/c3d/build_deploy.py \n",
209
+ " inflating: TGIF-Release-master/code/gifs-filter/c3d/c3d.sh \n",
210
+ " inflating: TGIF-Release-master/code/gifs-filter/c3d/deploy.prototxt.in \n",
211
+ " creating: TGIF-Release-master/code/gifs-filter/dedup/\n",
212
+ " inflating: TGIF-Release-master/code/gifs-filter/dedup/.gitignore \n",
213
+ " inflating: TGIF-Release-master/code/gifs-filter/dedup/README.md \n",
214
+ " inflating: TGIF-Release-master/code/gifs-filter/dedup/agg-hash.py \n",
215
+ " inflating: TGIF-Release-master/code/gifs-filter/dedup/build.sh \n",
216
+ " inflating: TGIF-Release-master/code/gifs-filter/dedup/cluster-pairs.py \n",
217
+ " inflating: TGIF-Release-master/code/gifs-filter/dedup/dedup-v2.sh \n",
218
+ " inflating: TGIF-Release-master/code/gifs-filter/dedup/dedup.sh \n",
219
+ " inflating: TGIF-Release-master/code/gifs-filter/dedup/dump-nd.py \n",
220
+ " inflating: TGIF-Release-master/code/gifs-filter/dedup/extract_hash.sh \n",
221
+ " inflating: TGIF-Release-master/code/gifs-filter/dedup/extract_mhhash.cpp \n",
222
+ " inflating: TGIF-Release-master/code/gifs-filter/dedup/filter-cluster.py \n",
223
+ " inflating: TGIF-Release-master/code/gifs-filter/dedup/match-hash.sh \n",
224
+ " creating: TGIF-Release-master/code/gifs-filter/dedup/mih/\n",
225
+ " extracting: TGIF-Release-master/code/gifs-filter/dedup/mih/README.md \n",
226
+ " creating: TGIF-Release-master/code/gifs-filter/dedup/pHash-0.9.6/\n",
227
+ " inflating: TGIF-Release-master/code/gifs-filter/dedup/pHash-0.9.6/README.md \n",
228
+ " inflating: TGIF-Release-master/code/gifs-filter/dedup/store-hash.sh \n",
229
+ " inflating: TGIF-Release-master/code/gifs-filter/email_notify.py \n",
230
+ " inflating: TGIF-Release-master/code/gifs-filter/full.sh \n",
231
+ " inflating: TGIF-Release-master/code/gifs-filter/gen_set.py \n",
232
+ " inflating: TGIF-Release-master/code/gifs-filter/monitor-api.sh \n",
233
+ " inflating: TGIF-Release-master/code/gifs-filter/monitor.sh \n",
234
+ " inflating: TGIF-Release-master/code/gifs-filter/pipeline.sh \n",
235
+ " inflating: TGIF-Release-master/code/gifs-filter/prepare-data.sh \n",
236
+ " inflating: TGIF-Release-master/code/gifs-filter/requirements.txt \n",
237
+ " inflating: TGIF-Release-master/code/gifs-filter/review-CF.py \n",
238
+ " inflating: TGIF-Release-master/code/gifs-filter/split-batches.sh \n",
239
+ " creating: TGIF-Release-master/code/gifs-filter/test/\n",
240
+ " extracting: TGIF-Release-master/code/gifs-filter/test/.gitignore \n",
241
+ " inflating: TGIF-Release-master/code/gifs-filter/test/gif.urls \n",
242
+ " creating: TGIF-Release-master/code/gifs-filter/text-score/\n",
243
+ " inflating: TGIF-Release-master/code/gifs-filter/text-score/.gitignore \n",
244
+ " inflating: TGIF-Release-master/code/gifs-filter/text-score/Makefile \n",
245
+ " inflating: TGIF-Release-master/code/gifs-filter/text-score/README.md \n",
246
+ " inflating: TGIF-Release-master/code/gifs-filter/text-score/debug.sh \n",
247
+ " inflating: TGIF-Release-master/code/gifs-filter/text-score/filter.hpp \n",
248
+ " inflating: TGIF-Release-master/code/gifs-filter/text-score/group_area.hpp \n",
249
+ " creating: TGIF-Release-master/code/gifs-filter/text-score/test/\n",
250
+ " inflating: TGIF-Release-master/code/gifs-filter/text-score/test/README.md \n",
251
+ " inflating: TGIF-Release-master/code/gifs-filter/text-score/test/benchmark.sh \n",
252
+ " inflating: TGIF-Release-master/code/gifs-filter/text-score/test/neg.urls \n",
253
+ " inflating: TGIF-Release-master/code/gifs-filter/text-score/test/pos.urls \n",
254
+ " inflating: TGIF-Release-master/code/gifs-filter/text-score/text-score.cpp \n",
255
+ " inflating: TGIF-Release-master/code/gifs-filter/text-score/text-score.sh \n",
256
+ " inflating: TGIF-Release-master/code/gifs-filter/text-score/textdetection.cpp \n",
257
+ " creating: TGIF-Release-master/data/\n",
258
+ " extracting: TGIF-Release-master/data/.gitignore \n",
259
+ " creating: TGIF-Release-master/data/GIF2Movie/\n",
260
+ " inflating: TGIF-Release-master/data/GIF2Movie/M-VAD.tsv \n",
261
+ " inflating: TGIF-Release-master/data/GIF2Movie/MPII-MD.tsv \n",
262
+ " inflating: TGIF-Release-master/data/README.md \n",
263
+ " creating: TGIF-Release-master/data/coco-caption/\n",
264
+ " inflating: TGIF-Release-master/data/eval.py \n",
265
+ " inflating: TGIF-Release-master/data/results-lstm-cnn-finetune-cvpr16.tsv \n",
266
+ " creating: TGIF-Release-master/data/splits/\n",
267
+ " extracting: TGIF-Release-master/data/splits/.gitignore \n",
268
+ " inflating: TGIF-Release-master/data/splits/test.txt \n",
269
+ " inflating: TGIF-Release-master/data/splits/train.txt \n",
270
+ " inflating: TGIF-Release-master/data/splits/val.txt \n",
271
+ " inflating: TGIF-Release-master/data/tgif-v1.0.tsv \n",
272
+ " creating: TGIF-Release-master/docs/\n",
273
+ " creating: TGIF-Release-master/docs/_includes/\n",
274
+ " inflating: TGIF-Release-master/docs/_includes/authors.html \n",
275
+ " inflating: TGIF-Release-master/docs/_includes/download.html \n",
276
+ " inflating: TGIF-Release-master/docs/_includes/examples.html \n",
277
+ " inflating: TGIF-Release-master/docs/_includes/footer.html \n",
278
+ " inflating: TGIF-Release-master/docs/_includes/head.html \n",
279
+ " inflating: TGIF-Release-master/docs/_includes/header.html \n",
280
+ " inflating: TGIF-Release-master/docs/_includes/nav.html \n",
281
+ " inflating: TGIF-Release-master/docs/_includes/overview.html \n",
282
+ " creating: TGIF-Release-master/docs/_layouts/\n",
283
+ " inflating: TGIF-Release-master/docs/_layouts/default.html \n",
284
+ " creating: TGIF-Release-master/docs/css/\n",
285
+ " inflating: TGIF-Release-master/docs/css/main.scss \n",
286
+ " extracting: TGIF-Release-master/docs/index.html \n",
287
+ " creating: TGIF-Release-master/docs/js/\n",
288
+ " inflating: TGIF-Release-master/docs/js/main.js \n"
289
+ ]
290
+ }
291
+ ],
292
+ "source": [
293
+ "# Use unzip to extract the master.zip file\n",
294
+ "!unzip master.zip"
295
+ ]
296
+ },
297
+ {
298
+ "cell_type": "markdown",
299
+ "metadata": {
300
+ "id": "7agJKFkZ6UGB"
301
+ },
302
+ "source": [
303
+ "## Explore the Dataset"
304
+ ]
305
+ },
306
+ {
307
+ "cell_type": "markdown",
308
+ "metadata": {},
309
+ "source": [
310
+ "Now let's explore the downloaded files. The data we want is in the *tgif-v1.0.tsv* file in the *data* folder. We can use the *pandas* library to open the file. We need to set the delimiter to `\\t` because the file contains tab-separated values."
311
+ ]
312
+ },
313
+ {
314
+ "cell_type": "code",
315
+ "execution_count": 4,
316
+ "metadata": {
317
+ "id": "1rwBQ3I2Ye7c"
318
+ },
319
+ "outputs": [],
320
+ "source": [
321
+ "import pandas as pd"
322
+ ]
323
+ },
324
+ {
325
+ "cell_type": "code",
326
+ "execution_count": 5,
327
+ "metadata": {
328
+ "id": "K8RvBSYSbvUb"
329
+ },
330
+ "outputs": [
331
+ {
332
+ "data": {
333
+ "text/html": [
334
+ "<div>\n",
335
+ "<style scoped>\n",
336
+ " .dataframe tbody tr th:only-of-type {\n",
337
+ " vertical-align: middle;\n",
338
+ " }\n",
339
+ "\n",
340
+ " .dataframe tbody tr th {\n",
341
+ " vertical-align: top;\n",
342
+ " }\n",
343
+ "\n",
344
+ " .dataframe thead th {\n",
345
+ " text-align: right;\n",
346
+ " }\n",
347
+ "</style>\n",
348
+ "<table border=\"1\" class=\"dataframe\">\n",
349
+ " <thead>\n",
350
+ " <tr style=\"text-align: right;\">\n",
351
+ " <th></th>\n",
352
+ " <th>url</th>\n",
353
+ " <th>description</th>\n",
354
+ " </tr>\n",
355
+ " </thead>\n",
356
+ " <tbody>\n",
357
+ " <tr>\n",
358
+ " <th>0</th>\n",
359
+ " <td>https://38.media.tumblr.com/9f6c25cc350f12aa74...</td>\n",
360
+ " <td>a man is glaring, and someone with sunglasses ...</td>\n",
361
+ " </tr>\n",
362
+ " <tr>\n",
363
+ " <th>1</th>\n",
364
+ " <td>https://38.media.tumblr.com/9ead028ef62004ef6a...</td>\n",
365
+ " <td>a cat tries to catch a mouse on a tablet</td>\n",
366
+ " </tr>\n",
367
+ " <tr>\n",
368
+ " <th>2</th>\n",
369
+ " <td>https://38.media.tumblr.com/9f43dc410be85b1159...</td>\n",
370
+ " <td>a man dressed in red is dancing.</td>\n",
371
+ " </tr>\n",
372
+ " <tr>\n",
373
+ " <th>3</th>\n",
374
+ " <td>https://38.media.tumblr.com/9f659499c8754e40cf...</td>\n",
375
+ " <td>an animal comes close to another in the jungle</td>\n",
376
+ " </tr>\n",
377
+ " <tr>\n",
378
+ " <th>4</th>\n",
379
+ " <td>https://38.media.tumblr.com/9ed1c99afa7d714118...</td>\n",
380
+ " <td>a man in a hat adjusts his tie and makes a wei...</td>\n",
381
+ " </tr>\n",
382
+ " </tbody>\n",
383
+ "</table>\n",
384
+ "</div>"
385
+ ],
386
+ "text/plain": [
387
+ " url \\\n",
388
+ "0 https://38.media.tumblr.com/9f6c25cc350f12aa74... \n",
389
+ "1 https://38.media.tumblr.com/9ead028ef62004ef6a... \n",
390
+ "2 https://38.media.tumblr.com/9f43dc410be85b1159... \n",
391
+ "3 https://38.media.tumblr.com/9f659499c8754e40cf... \n",
392
+ "4 https://38.media.tumblr.com/9ed1c99afa7d714118... \n",
393
+ "\n",
394
+ " description \n",
395
+ "0 a man is glaring, and someone with sunglasses ... \n",
396
+ "1 a cat tries to catch a mouse on a tablet \n",
397
+ "2 a man dressed in red is dancing. \n",
398
+ "3 an animal comes close to another in the jungle \n",
399
+ "4 a man in a hat adjusts his tie and makes a wei... "
400
+ ]
401
+ },
402
+ "execution_count": 5,
403
+ "metadata": {},
404
+ "output_type": "execute_result"
405
+ }
406
+ ],
407
+ "source": [
408
+ "# Load dataset to a pandas dataframe\n",
409
+ "df = pd.read_csv(\n",
410
+ " \"./TGIF-Release-master/data/tgif-v1.0.tsv\",\n",
411
+ " delimiter=\"\\t\",\n",
412
+ " names=['url', 'description']\n",
413
+ ")\n",
414
+ "df.head()"
415
+ ]
416
+ },
417
+ {
418
+ "cell_type": "markdown",
419
+ "metadata": {},
420
+ "source": [
421
+ "*Note the dataset does not contain the actual GIF files. But it has URLs we can use to download/access the GIF files. This is great as we do not need to store/download all the GIF files. We can directly load the required GIF files using the URL when displaying the search results.*\n",
422
+ "\n",
423
+ "Some GIF URLs appear more than once in the dataset, each paired with a different description."
424
+ ]
425
+ },
426
+ {
427
+ "cell_type": "code",
428
+ "execution_count": 6,
429
+ "metadata": {},
430
+ "outputs": [
431
+ {
432
+ "data": {
433
+ "text/plain": [
434
+ "125782"
435
+ ]
436
+ },
437
+ "execution_count": 6,
438
+ "metadata": {},
439
+ "output_type": "execute_result"
440
+ }
441
+ ],
442
+ "source": [
443
+ "len(df)"
444
+ ]
445
+ },
446
+ {
447
+ "cell_type": "code",
448
+ "execution_count": 7,
449
+ "metadata": {},
450
+ "outputs": [
451
+ {
452
+ "data": {
453
+ "text/plain": [
454
+ "102068"
455
+ ]
456
+ },
457
+ "execution_count": 7,
458
+ "metadata": {},
459
+ "output_type": "execute_result"
460
+ }
461
+ ],
462
+ "source": [
463
+ "# Number of *unique* GIFs in the dataset\n",
464
+ "len(df[\"url\"].unique())"
465
+ ]
466
+ },
467
+ {
468
+ "cell_type": "code",
469
+ "execution_count": 8,
470
+ "metadata": {},
471
+ "outputs": [
472
+ {
473
+ "data": {
474
+ "text/plain": [
475
+ "https://38.media.tumblr.com/ddbfe51aff57fd8446f49546bc027bd7/tumblr_nowv0v6oWj1uwbrato1_500.gif 4\n",
476
+ "https://33.media.tumblr.com/46c873a60bb8bd97bdc253b826d1d7a1/tumblr_nh7vnlXEvL1u6fg3no1_500.gif 4\n",
477
+ "https://38.media.tumblr.com/b544f3c87cbf26462dc267740bb1c842/tumblr_n98uooxl0K1thiyb6o1_250.gif 4\n",
478
+ "https://33.media.tumblr.com/88235b43b48e9823eeb3e7890f3d46ef/tumblr_nkg5leY4e21sof15vo1_500.gif 4\n",
479
+ "https://31.media.tumblr.com/69bca8520e1f03b4148dde2ac78469ec/tumblr_npvi0kW4OD1urqm0mo1_400.gif 4\n",
480
+ "Name: url, dtype: int64"
481
+ ]
482
+ },
483
+ "execution_count": 8,
484
+ "metadata": {},
485
+ "output_type": "execute_result"
486
+ }
487
+ ],
488
+ "source": [
489
+ "dupes = df['url'].value_counts().sort_values(ascending=False)\n",
490
+ "dupes.head()"
491
+ ]
492
+ },
493
+ {
494
+ "cell_type": "markdown",
495
+ "metadata": {},
496
+ "source": [
497
+ "Let's take a look at one of these duplicated URLs and its descriptions."
498
+ ]
499
+ },
500
+ {
501
+ "cell_type": "code",
502
+ "execution_count": 9,
503
+ "metadata": {},
504
+ "outputs": [
505
+ {
506
+ "data": {
507
+ "text/html": [
508
+ "<img src=https://33.media.tumblr.com/88235b43b48e9823eeb3e7890f3d46ef/tumblr_nkg5leY4e21sof15vo1_500.gif style='width:120px; height:90px'>"
509
+ ],
510
+ "text/plain": [
511
+ "<IPython.core.display.HTML object>"
512
+ ]
513
+ },
514
+ "execution_count": 9,
515
+ "metadata": {},
516
+ "output_type": "execute_result"
517
+ },
518
+ {
519
+ "name": "stdout",
520
+ "output_type": "stream",
521
+ "text": [
522
+ "two girls are singing music pop in a concert\n"
523
+ ]
524
+ },
525
+ {
526
+ "data": {
527
+ "text/html": [
528
+ "<img src=https://33.media.tumblr.com/88235b43b48e9823eeb3e7890f3d46ef/tumblr_nkg5leY4e21sof15vo1_500.gif style='width:120px; height:90px'>"
529
+ ],
530
+ "text/plain": [
531
+ "<IPython.core.display.HTML object>"
532
+ ]
533
+ },
534
+ "execution_count": 9,
535
+ "metadata": {},
536
+ "output_type": "execute_result"
537
+ },
538
+ {
539
+ "name": "stdout",
540
+ "output_type": "stream",
541
+ "text": [
542
+ "a woman sings sang girl on a stage singing\n"
543
+ ]
544
+ },
545
+ {
546
+ "data": {
547
+ "text/html": [
548
+ "<img src=https://33.media.tumblr.com/88235b43b48e9823eeb3e7890f3d46ef/tumblr_nkg5leY4e21sof15vo1_500.gif style='width:120px; height:90px'>"
549
+ ],
550
+ "text/plain": [
551
+ "<IPython.core.display.HTML object>"
552
+ ]
553
+ },
554
+ "execution_count": 9,
555
+ "metadata": {},
556
+ "output_type": "execute_result"
557
+ },
558
+ {
559
+ "name": "stdout",
560
+ "output_type": "stream",
561
+ "text": [
562
+ "two girls on a stage sing into microphones.\n"
563
+ ]
564
+ },
565
+ {
566
+ "data": {
567
+ "text/html": [
568
+ "<img src=https://33.media.tumblr.com/88235b43b48e9823eeb3e7890f3d46ef/tumblr_nkg5leY4e21sof15vo1_500.gif style='width:120px; height:90px'>"
569
+ ],
570
+ "text/plain": [