Spaces: Runtime error
Create app.py
app.py ADDED
@@ -0,0 +1,764 @@
import streamlit as st

st.set_page_config(page_title="Memory and Mirroring", page_icon=":brain:", layout="wide")

hide_streamlit_style = """
<style>
#MainMenu {visibility: hidden;}
footer {visibility: hidden;}
</style>
"""
st.markdown(hide_streamlit_style, unsafe_allow_html=True)

st.title(":brain: Memory and Mirroring")

with st.expander(":memo: Semantic and Episodic Memory"):
    st.subheader(":one: Semantic Memory")
    st.markdown("**Semantic memory** is a type of long-term memory that stores facts, concepts, and knowledge about the world. It's responsible for our general knowledge and understanding.")

    st.subheader(":two: Episodic Memory")
    st.markdown("**Episodic memory** is another type of long-term memory that stores personal experiences and events, including their temporal and spatial contexts.")

with st.expander(":robot: Mirroring in Behavioral Health"):
    st.subheader(":one: What is Mirroring?")
    st.markdown("**Mirroring** is a technique used in behavioral health where a person subtly imitates the gestures, speech patterns, or attitudes of another to build rapport and understanding.")

    st.subheader(":two: Benefits of Mirroring")
    st.markdown("Mirroring can help improve communication, empathy, and trust between individuals, making it a valuable tool in therapy and coaching.")

    st.subheader(":three: Mirroring vs. Mimicry")
    st.markdown("While mirroring is a subtle and respectful way of connecting with someone, **mimicry** is an exaggerated form of imitation that can come off as mocking or insincere.")

st.sidebar.title(":guardsman: Rules")

st.sidebar.markdown("""
1. **Respect** the other person's personal space.

# 🩺🔍 Search Results

### 18 Jan 2023 | [Joint Representation Learning for Text and 3D Point Cloud](https://arxiv.org/abs/2301.07584) | [⬇️](https://arxiv.org/pdf/2301.07584)
*Rui Huang, Xuran Pan, Henry Zheng, Haojun Jiang, Zhifeng Xie, Shiji Song, Gao Huang*

Recent advancements in vision-language pre-training (e.g. CLIP) have shown that vision models can benefit from language supervision. While many models using language modality have achieved great success on 2D vision tasks, the joint representation learning of 3D point cloud with text remains under-explored due to the difficulty of 3D-Text data pair acquisition and the irregularity of 3D data structure. In this paper, we propose a novel Text4Point framework to construct language-guided 3D point cloud models. The key idea is utilizing 2D images as a bridge to connect the point cloud and the language modalities. The proposed Text4Point follows the pre-training and fine-tuning paradigm. During the pre-training stage, we establish the correspondence of images and point clouds based on the readily available RGB-D data and use contrastive learning to align the image and point cloud representations. Together with the well-aligned image and text features achieved by CLIP, the point cloud features are implicitly aligned with the text embeddings. Further, we propose a Text Querying Module to integrate language information into 3D representation learning by querying text embeddings with point cloud features. For fine-tuning, the model learns task-specific 3D representations under informative language guidance from the label set without 2D images. Extensive experiments demonstrate that our model shows consistent improvement on various downstream tasks, such as point cloud semantic segmentation, instance segmentation, and object detection. The code will be available here: https://github.com/LeapLabTHU/Text4Point

---------------
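
As a rough illustration of the symmetric contrastive alignment described above, a plain InfoNCE objective over paired embeddings could be sketched as follows; the names, shapes, and temperature are placeholders, not the Text4Point code.

```python
# Illustrative only: a symmetric InfoNCE loss for aligning two modalities.
import torch
import torch.nn.functional as F

def info_nce(img_emb: torch.Tensor, pc_emb: torch.Tensor, temperature: float = 0.07) -> torch.Tensor:
    # img_emb, pc_emb: (batch, dim) embeddings of paired views of the same scenes
    img_emb = F.normalize(img_emb, dim=-1)
    pc_emb = F.normalize(pc_emb, dim=-1)
    logits = img_emb @ pc_emb.t() / temperature          # (batch, batch) cosine similarities
    targets = torch.arange(img_emb.size(0), device=img_emb.device)
    # each image should match its own point cloud, and vice versa
    return 0.5 * (F.cross_entropy(logits, targets) + F.cross_entropy(logits.t(), targets))
```
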

### 14 Jul 2017 | [A Semantics-Based Measure of Emoji Similarity](https://arxiv.org/abs/1707.04653) | [⬇️](https://arxiv.org/pdf/1707.04653)
*Sanjaya Wijeratne, Lakshika Balasuriya, Amit Sheth, Derek Doran*

Emoji have grown to become one of the most important forms of communication on the web. With their widespread use, measuring the similarity of emoji has become an important problem for contemporary text processing since it lies at the heart of sentiment analysis, search, and interface design tasks. This paper presents a comprehensive analysis of the semantic similarity of emoji through embedding models that are learned over machine-readable emoji meanings in the EmojiNet knowledge base. Using emoji descriptions, emoji sense labels and emoji sense definitions, and with different training corpora obtained from Twitter and Google News, we develop and test multiple embedding models to measure emoji similarity. To evaluate our work, we create a new dataset called EmoSim508, which assigns human-annotated semantic similarity scores to a set of 508 carefully selected emoji pairs. After validation with EmoSim508, we present a real-world use-case of our emoji embedding models using a sentiment analysis task and show that our models outperform the previous best-performing emoji embedding model on this task. The EmoSim508 dataset and our emoji embedding models are publicly released with this paper and can be downloaded from http://emojinet.knoesis.org/.

---------------

### 11 Mar 2021 | [Semantic MapNet: Building Allocentric Semantic Maps and Representations from Egocentric Views](https://arxiv.org/abs/2010.01191) | [⬇️](https://arxiv.org/pdf/2010.01191)
*Vincent Cartillier, Zhile Ren, Neha Jain, Stefan Lee, Irfan Essa, Dhruv Batra*

We study the task of semantic mapping - specifically, an embodied agent (a robot or an egocentric AI assistant) is given a tour of a new environment and asked to build an allocentric top-down semantic map ("what is where?") from egocentric observations of an RGB-D camera with known pose (via localization sensors). Towards this goal, we present SemanticMapNet (SMNet), which consists of: (1) an Egocentric Visual Encoder that encodes each egocentric RGB-D frame, (2) a Feature Projector that projects egocentric features to appropriate locations on a floor-plan, (3) a Spatial Memory Tensor of size floor-plan length x width x feature-dims that learns to accumulate projected egocentric features, and (4) a Map Decoder that uses the memory tensor to produce semantic top-down maps. SMNet combines the strengths of (known) projective camera geometry and neural representation learning. On the task of semantic mapping in the Matterport3D dataset, SMNet significantly outperforms competitive baselines by 4.01-16.81% (absolute) on mean-IoU and 3.81-19.69% (absolute) on Boundary-F1 metrics. Moreover, we show how to use the neural episodic memories and spatio-semantic allocentric representations built by SMNet for subsequent tasks in the same space - navigating to objects seen during the tour ("Find chair") or answering questions about the space ("How many chairs did you see in the house?"). Project page: https://vincentcartillier.github.io/smnet.html.

---------------
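
A toy version of the projection-and-accumulation step described above might look like the sketch below, assuming precomputed grid cells per pixel; the sizes and the simple averaging are invented, where SMNet instead learns the accumulation.

```python
# Toy stand-in for "project egocentric features onto a top-down grid and accumulate".
import numpy as np

H, W, D = 4, 5, 8                                    # floor-plan length, width, feature dims
grid = np.zeros((H, W, D))
counts = np.zeros((H, W, 1))

feats = np.random.rand(100, D)                       # per-pixel features from one egocentric frame
cells = np.random.randint(0, [H, W], size=(100, 2))  # grid cell each pixel projects to

np.add.at(grid, (cells[:, 0], cells[:, 1]), feats)   # accumulate features per cell
np.add.at(counts, (cells[:, 0], cells[:, 1]), 1)
mean_map = grid / np.maximum(counts, 1)              # plain average instead of a learned update
```
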

### 06 Mar 2020 | [Distributional semantic modeling: a revised technique to train term/word vector space models applying the ontology-related approach](https://arxiv.org/abs/2003.03350) | [⬇️](https://arxiv.org/pdf/2003.03350)
*Oleksandr Palagin, Vitalii Velychko, Kyrylo Malakhov and Oleksandr Shchurov*

We design a new technique for the distributional semantic modeling with a neural network-based approach to learn distributed term representations (or term embeddings) - term vector space models as a result, inspired by the recent ontology-related approach (using different types of contextual knowledge such as syntactic knowledge, terminological knowledge, semantic knowledge, etc.) to the identification of terms (term extraction) and relations between them (relation extraction) called semantic pre-processing technology - SPT. Our method relies on automatic term extraction from the natural language texts and subsequent formation of the problem-oriented or application-oriented (also deeply annotated) text corpora where the fundamental entity is the term (includes non-compositional and compositional terms). This gives us an opportunity to change over from distributed word representations (or word embeddings) to distributed term representations (or term embeddings). This transition will allow us to generate more accurate semantic maps of different subject domains (also, of relations between input terms - it is useful to explore clusters and oppositions, or to test your hypotheses about them). The semantic map can be represented as a graph using Vec2graph - a Python library for visualizing word embeddings (term embeddings in our case) as dynamic and interactive graphs. The Vec2graph library coupled with term embeddings will not only improve accuracy in solving standard NLP tasks, but also update the conventional concept of automated ontology development. The main practical result of our work is the development kit (set of toolkits represented as web service APIs and web application), which provides all necessary routines for the basic linguistic pre-processing and the semantic pre-processing of the natural language texts in Ukrainian for future training of term vector space models.

---------------

### 23 Jan 2023 | [Lexi: Self-Supervised Learning of the UI Language](https://arxiv.org/abs/2301.10165) | [⬇️](https://arxiv.org/pdf/2301.10165)
*Pratyay Banerjee, Shweti Mahajan, Kushal Arora, Chitta Baral, Oriana Riva*

Humans can learn to operate the user interface (UI) of an application by reading an instruction manual or how-to guide. Along with text, these resources include visual content such as UI screenshots and images of application icons referenced in the text. We explore how to leverage this data to learn generic visio-linguistic representations of UI screens and their components. These representations are useful in many real applications, such as accessibility, voice navigation, and task automation. Prior UI representation models rely on UI metadata (UI trees and accessibility labels), which is often missing, incompletely defined, or not accessible. We avoid such a dependency, and propose Lexi, a pre-trained vision and language model designed to handle the unique features of UI screens, including their text richness and context sensitivity. To train Lexi we curate the UICaption dataset consisting of 114k UI images paired with descriptions of their functionality. We evaluate Lexi on four tasks: UI action entailment, instruction-based UI image retrieval, grounding referring expressions, and UI entity recognition.

---------------

### 19 Feb 2024 | [ScreenAI: A Vision-Language Model for UI and Infographics Understanding](https://arxiv.org/abs/2402.04615) | [⬇️](https://arxiv.org/pdf/2402.04615)
*Gilles Baechler, Srinivas Sunkara, Maria Wang, Fedir Zubach, Hassan Mansoor, Vincent Etter, Victor Cărbune, Jason Lin, Jindong Chen, Abhanshu Sharma*

Screen user interfaces (UIs) and infographics, sharing similar visual language and design principles, play important roles in human communication and human-machine interaction. We introduce ScreenAI, a vision-language model that specializes in UI and infographics understanding. Our model improves upon the PaLI architecture with the flexible patching strategy of pix2struct and is trained on a unique mixture of datasets. At the heart of this mixture is a novel screen annotation task in which the model has to identify the type and location of UI elements. We use these text annotations to describe screens to Large Language Models and automatically generate question-answering (QA), UI navigation, and summarization training datasets at scale. We run ablation studies to demonstrate the impact of these design choices. At only 5B parameters, ScreenAI achieves new state-of-the-art results on UI- and infographics-based tasks (Multi-page DocVQA, WebSRC, MoTIF and Widget Captioning), and new best-in-class performance on others (Chart QA, DocVQA, and InfographicVQA) compared to models of similar size. Finally, we release three new datasets: one focused on the screen annotation task and two others focused on question answering.

---------------

### 14 Jul 2017 | [EmojiNet: An Open Service and API for Emoji Sense Discovery](https://arxiv.org/abs/1707.04652) | [⬇️](https://arxiv.org/pdf/1707.04652)
*Sanjaya Wijeratne, Lakshika Balasuriya, Amit Sheth, Derek Doran*

This paper presents the release of EmojiNet, the largest machine-readable emoji sense inventory that links Unicode emoji representations to their English meanings extracted from the Web. EmojiNet is a dataset consisting of: (i) 12,904 sense labels over 2,389 emoji, which were extracted from the web and linked to machine-readable sense definitions seen in BabelNet, (ii) context words associated with each emoji sense, which are inferred through word embedding models trained over Google News corpus and a Twitter message corpus for each emoji sense definition, and (iii) recognizing discrepancies in the presentation of emoji on different platforms, specification of the most likely platform-based emoji sense for a selected set of emoji. The dataset is hosted as an open service with a REST API and is available at http://emojinet.knoesis.org/. The development of this dataset, evaluation of its quality, and its applications including emoji sense disambiguation and emoji sense similarity are discussed.

---------------

### 22 Dec 2021 | [VoiceMoji: A Novel On-Device Pipeline for Seamless Emoji Insertion in Dictation](https://arxiv.org/abs/2112.12028) | [⬇️](https://arxiv.org/pdf/2112.12028)
*Sumit Kumar, Harichandana B S S, and Himanshu Arora*

Most of the speech recognition systems recover only words in the speech and fail to capture emotions. Users have to manually add emoji(s) in text for adding tone and making communication fun. Though there is much work done on punctuation addition on transcribed speech, the area of emotion addition is untouched. In this paper, we propose a novel on-device pipeline to enrich the voice input experience. It involves, given a blob of transcribed text, intelligently processing and identifying structure where emoji insertion makes sense. Moreover, it includes semantic text analysis to predict emoji for each of the sub-parts, for which we propose a novel architecture, Attention-based Char Aware (ACA) LSTM, which handles Out-Of-Vocabulary (OOV) words as well. All these tasks are executed completely on-device and hence can aid on-device dictation systems. To the best of our knowledge, this is the first work that shows how to add emoji(s) in the transcribed text. We demonstrate that our components achieve comparable results to previous neural approaches for punctuation addition and emoji prediction with 80% fewer parameters. Overall, our proposed model has a very small memory footprint of a mere 4MB to suit on-device deployment.

---------------

### 06 Jul 2023 | [Caption Anything: Interactive Image Description with Diverse Multimodal Controls](https://arxiv.org/abs/2305.02677) | [⬇️](https://arxiv.org/pdf/2305.02677)
*Teng Wang, Jinrui Zhang, Junjie Fei, Hao Zheng, Yunlong Tang, Zhe Li, Mingqi Gao, Shanshan Zhao*

Controllable image captioning is an emerging multimodal topic that aims to describe the image with natural language following human purpose, *e.g.*, looking at the specified regions or telling in a particular text style. State-of-the-art methods are trained on annotated pairs of input controls and output captions. However, the scarcity of such well-annotated multimodal data largely limits their usability and scalability for interactive AI systems. Leveraging unimodal instruction-following foundation models is a promising alternative that benefits from broader sources of data. In this paper, we present Caption AnyThing (CAT), a foundation model augmented image captioning framework supporting a wide range of multimodal controls: 1) visual controls, including points, boxes, and trajectories; 2) language controls, such as sentiment, length, language, and factuality. Powered by Segment Anything Model (SAM) and ChatGPT, we unify the visual and language prompts into a modularized framework, enabling the flexible combination between different controls. Extensive case studies demonstrate the user intention alignment capabilities of our framework, shedding light on effective user interaction modeling in vision-language applications. Our code is publicly available at https://github.com/ttengwang/Caption-Anything.

---------------

### 12 Sep 2022 | [emojiSpace: Spatial Representation of Emojis](https://arxiv.org/abs/2209.09871) | [⬇️](https://arxiv.org/pdf/2209.09871)
*Moeen Mostafavi, Mahsa Pahlavikhah Varnosfaderani, Fateme Nikseresht, Seyed Ahmad Mansouri*

In the absence of nonverbal cues during messaging communication, users express part of their emotions using emojis. Thus, having emojis in the vocabulary of text messaging language models can significantly improve many natural language processing (NLP) applications such as online communication analysis. On the other hand, word embedding models are usually trained on a very large corpus of text such as Wikipedia or Google News datasets that include very few samples with emojis. In this study, we create emojiSpace, which is a combined word-emoji embedding using the word2vec model from the Gensim library in Python. We trained emojiSpace on a corpus of more than 4 billion tweets and evaluated it by implementing sentiment analysis on a Twitter dataset containing more than 67 million tweets as an extrinsic task. For this task, we compared the performance of two different classifiers, random forest (RF) and linear support vector machine (SVM). For evaluation, we compared emojiSpace performance with two other pre-trained embeddings and demonstrated that emojiSpace outperforms both.

---------------
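
A minimal sketch of such a combined word-emoji embedding, using gensim's Word2Vec (4.x API) on a toy corpus that stands in for the tweets described above:

```python
# Illustrative only: words and emojis trained into one embedding space.
from gensim.models import Word2Vec

tweets = [
    ["great", "game", "tonight", "🔥", "🔥"],
    ["so", "tired", "of", "this", "rain", "😩"],
    ["love", "this", "song", "❤️", "🎶"],
]
model = Word2Vec(sentences=tweets, vector_size=100, window=5, min_count=1, workers=2)
print(model.wv.most_similar("🔥", topn=3))   # emojis and words share the same vector space
```
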

### 18 May 2022 | [Graph Adaptive Semantic Transfer for Cross-domain Sentiment Classification](https://arxiv.org/abs/2205.08772) | [⬇️](https://arxiv.org/pdf/2205.08772)
*Kai Zhang, Qi Liu, Zhenya Huang, Mingyue Cheng, Kun Zhang, Mengdi Zhang, Wei Wu, Enhong Chen*

Cross-domain sentiment classification (CDSC) aims to use the transferable semantics learned from the source domain to predict the sentiment of reviews in the unlabeled target domain. Existing studies in this task attach more attention to the sequence modeling of sentences while largely ignoring the rich domain-invariant semantics embedded in graph structures (i.e., the part-of-speech tags and dependency relations). As an important aspect of exploring characteristics of language comprehension, adaptive graph representations have played an essential role in recent years. To this end, in the paper, we aim to explore the possibility of learning invariant semantic features from graph-like structures in CDSC. Specifically, we present the Graph Adaptive Semantic Transfer (GAST) model, an adaptive syntactic graph embedding method that is able to learn domain-invariant semantics from both word sequences and syntactic graphs. More specifically, we first raise a POS-Transformer module to extract sequential semantic features from the word sequences as well as the part-of-speech tags. Then, we design a Hybrid Graph Attention (HGAT) module to generate syntax-based semantic features by considering the transferable dependency relations. Finally, we devise an Integrated aDaptive Strategy (IDS) to guide the joint learning process of both modules. Extensive experiments on four public datasets indicate that GAST achieves comparable effectiveness to a range of state-of-the-art models.

---------------

### 03 Apr 2018 | [Contrastive Learning of Emoji-based Representations for Resource-Poor Languages](https://arxiv.org/abs/1804.01855) | [⬇️](https://arxiv.org/pdf/1804.01855)
*Nurendra Choudhary, Rajat Singh, Ishita Bindlish and Manish Shrivastava*

The introduction of emojis (or emoticons) in social media platforms has given the users an increased potential for expression. We propose a novel method called Classification of Emojis using Siamese Network Architecture (CESNA) to learn emoji-based representations of resource-poor languages by jointly training them with resource-rich languages using a siamese network. The CESNA model consists of twin Bi-directional Long Short-Term Memory Recurrent Neural Networks (Bi-LSTM RNN) with shared parameters joined by a contrastive loss function based on a similarity metric. The model learns the representations of resource-poor and resource-rich language in a common emoji space by using a similarity metric based on the emojis present in sentences from both languages. The model, hence, projects sentences with similar emojis closer to each other and the sentences with different emojis farther from one another. Experiments on large-scale Twitter datasets of resource-rich languages - English and Spanish - and resource-poor languages - Hindi and Telugu - reveal that CESNA outperforms the state-of-the-art emoji prediction approaches based on distributional semantics, semantic rules, lexicon lists and deep neural network representations without shared parameters.

---------------
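
A schematic of a siamese Bi-LSTM encoder trained with a contrastive loss, loosely following the description above; layer sizes and the margin are arbitrary placeholders, not the CESNA configuration.

```python
# Illustrative siamese encoder: both branches share weights; a contrastive loss
# pulls sentences with the same emojis together and pushes others apart.
import torch
import torch.nn as nn
import torch.nn.functional as F

class SiameseBiLSTM(nn.Module):
    def __init__(self, vocab_size: int, emb_dim: int = 128, hidden: int = 64):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hidden, batch_first=True, bidirectional=True)

    def encode(self, tokens):                    # tokens: (batch, seq_len) token ids
        out, _ = self.lstm(self.emb(tokens))
        return out.mean(dim=1)                   # (batch, 2*hidden) sentence vector

    def forward(self, a, b):                     # the two branches share the same weights
        return self.encode(a), self.encode(b)

def contrastive_loss(za, zb, same_emoji, margin: float = 1.0):
    # same_emoji: 1.0 where the two sentences share emojis, else 0.0
    d = F.pairwise_distance(za, zb)
    return (same_emoji * d.pow(2) + (1 - same_emoji) * F.relu(margin - d).pow(2)).mean()
```
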

### 16 Feb 2023 | [VLTinT: Visual-Linguistic Transformer-in-Transformer for Coherent Video Paragraph Captioning](https://arxiv.org/abs/2211.15103) | [⬇️](https://arxiv.org/pdf/2211.15103)
*Kashu Yamazaki, Khoa Vo, Sang Truong, Bhiksha Raj, Ngan Le*

Video paragraph captioning aims to generate a multi-sentence description of an untrimmed video with several temporal event locations in coherent storytelling. Following the human perception process, where the scene is effectively understood by decomposing it into visual (e.g. human, animal) and non-visual components (e.g. action, relations) under the mutual influence of vision and language, we first propose a visual-linguistic (VL) feature. In the proposed VL feature, the scene is modeled by three modalities including (i) a global visual environment; (ii) local visual main agents; (iii) linguistic scene elements. We then introduce an autoregressive Transformer-in-Transformer (TinT) to simultaneously capture the semantic coherence of intra- and inter-event contents within a video. Finally, we present a new VL contrastive loss function to guarantee learnt embedding features are matched with the captions' semantics. Comprehensive experiments and extensive ablation studies on ActivityNet Captions and YouCookII datasets show that the proposed Visual-Linguistic Transformer-in-Transformer (VLTinT) outperforms prior state-of-the-art methods on accuracy and diversity. Source code is made publicly available at: https://github.com/UARK-AICV/VLTinT.

---------------

### 21 Jun 2023 | [EmTract: Extracting Emotions from Social Media](https://arxiv.org/abs/2112.03868) | [⬇️](https://arxiv.org/pdf/2112.03868)
*Domonkos F. Vamossy and Rolf Skog*

We develop an open-source tool (EmTract) that extracts emotions from social media text tailored for financial contexts. To do so, we annotate ten thousand short messages from a financial social media platform (StockTwits) and combine it with open-source emotion data. We then use a pre-tuned NLP model, DistilBERT, augment its embedding space by including 4,861 tokens (emojis and emoticons), and then fit it first on the open-source emotion data, then transfer it to our annotated financial social media data. Our model outperforms competing open-source state-of-the-art emotion classifiers, such as Emotion English DistilRoBERTa-base, on both human and chatGPT annotated data. Compared to dictionary based methods, our methodology has three main advantages for research in finance. First, our model is tailored to financial social media text; second, it incorporates key aspects of social media data, such as non-standard phrases, emojis, and emoticons; and third, it operates by sequentially learning a latent representation that includes features such as word order, word usage, and local context. Using EmTract, we explore the relationship between investor emotions expressed on social media and asset prices. We show that firm-specific investor emotions are predictive of daily price movements. Our findings show that emotions and market dynamics are closely related, and we provide a tool to help study the role emotions play in financial markets.

---------------
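
Growing a pre-trained tokenizer and model vocabulary with emoji tokens before fine-tuning, as the abstract describes, can be sketched with the Hugging Face transformers API; the checkpoint, token list, and label count below are placeholders, not the EmTract release.

```python
# Illustrative only: add emoji/emoticon tokens and resize the embedding matrix.
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=7)

new_tokens = ["🚀", "💎", "🙌", ":-)"]              # emojis and emoticons to add
num_added = tokenizer.add_tokens(new_tokens)
model.resize_token_embeddings(len(tokenizer))       # grow the embedding matrix to match
print(f"added {num_added} tokens; vocabulary size is now {len(tokenizer)}")
```
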

### 29 Oct 2022 | [Open-world Semantic Segmentation via Contrasting and Clustering Vision-Language Embedding](https://arxiv.org/abs/2207.08455) | [⬇️](https://arxiv.org/pdf/2207.08455)
*Quande Liu, Youpeng Wen, Jianhua Han, Chunjing Xu, Hang Xu, Xiaodan Liang*

To bridge the gap between supervised semantic segmentation and real-world applications that acquire one model to recognize arbitrary new concepts, recent zero-shot segmentation attracts a lot of attention by exploring the relationships between unseen and seen object categories, yet requiring large amounts of densely-annotated data with diverse base classes. In this paper, we propose a new open-world semantic segmentation pipeline that makes the first attempt to learn to segment semantic objects of various open-world categories without any efforts on dense annotations, by purely exploiting the image-caption data that naturally exist on the Internet. Our method, Vision-language-driven Semantic Segmentation (ViL-Seg), employs an image and a text encoder to generate visual and text embeddings for the image-caption data, with two core components that endow its segmentation ability: First, the image encoder is jointly trained with a vision-based contrasting and a cross-modal contrasting, which encourage the visual embeddings to preserve both fine-grained semantics and high-level category information that are crucial for the segmentation task. Furthermore, an online clustering head is devised over the image encoder, which allows to dynamically segment the visual embeddings into distinct semantic groups such that they can be classified by comparing with various text embeddings to complete our segmentation pipeline. Experiments show that without using any data with dense annotations, our method can directly segment objects of arbitrary categories, outperforming zero-shot segmentation methods that require data labeling on three benchmark datasets.

---------------

### 19 Jan 2024 | [PoseScript: Linking 3D Human Poses and Natural Language](https://arxiv.org/abs/2210.11795) | [⬇️](https://arxiv.org/pdf/2210.11795)
*Ginger Delmas, Philippe Weinzaepfel, Thomas Lucas, Francesc Moreno-Noguer, Grégory Rogez*

Natural language plays a critical role in many computer vision applications, such as image captioning, visual question answering, and cross-modal retrieval, to provide fine-grained semantic information. Unfortunately, while human pose is key to human understanding, current 3D human pose datasets lack detailed language descriptions. To address this issue, we have introduced the PoseScript dataset. This dataset pairs more than six thousand 3D human poses from AMASS with rich human-annotated descriptions of the body parts and their spatial relationships. Additionally, to increase the size of the dataset to a scale that is compatible with data-hungry learning algorithms, we have proposed an elaborate captioning process that generates automatic synthetic descriptions in natural language from given 3D keypoints. This process extracts low-level pose information, known as "posecodes", using a set of simple but generic rules on the 3D keypoints. These posecodes are then combined into higher level textual descriptions using syntactic rules. With automatic annotations, the amount of available data significantly scales up (100k), making it possible to effectively pretrain deep models for finetuning on human captions. To showcase the potential of annotated poses, we present three multi-modal learning tasks that utilize the PoseScript dataset. Firstly, we develop a pipeline that maps 3D poses and textual descriptions into a joint embedding space, allowing for cross-modal retrieval of relevant poses from large-scale datasets. Secondly, we establish a baseline for a text-conditioned model generating 3D poses. Thirdly, we present a learned process for generating pose descriptions. These applications demonstrate the versatility and usefulness of annotated poses in various tasks and pave the way for future research in the field.

---------------
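
A hand-written posecode-style rule over 3D keypoints could look like the sketch below; the keypoint names and the threshold are invented for the example, not PoseScript's actual rule set.

```python
# Illustrative only: one low-level rule that turns keypoint geometry into text.
import numpy as np

keypoints = {
    "head": np.array([0.0, 1.70, 0.00]),
    "left_hand": np.array([0.20, 1.90, 0.10]),
    "right_hand": np.array([0.30, 0.90, 0.00]),
}

def hand_above_head(kp, hand, eps=0.05):
    return kp[hand][1] > kp["head"][1] + eps     # compare heights on the y axis

posecodes = [f"the {h.replace('_', ' ')} is raised above the head"
             for h in ("left_hand", "right_hand") if hand_above_head(keypoints, h)]
print(posecodes)                                 # ['the left hand is raised above the head']
```
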

### 11 Sep 2023 | [Tell me what you see: A zero-shot action recognition method based on natural language descriptions](https://arxiv.org/abs/2112.09976) | [⬇️](https://arxiv.org/pdf/2112.09976)
*Valter Estevam and Rayson Laroca and David Menotti and Helio Pedrini*

This paper presents a novel approach to Zero-Shot Action Recognition. Recent works have explored the detection and classification of objects to obtain semantic information from videos with remarkable performance. Inspired by them, we propose using video captioning methods to extract semantic information about objects, scenes, humans, and their relationships. To the best of our knowledge, this is the first work to represent both videos and labels with descriptive sentences. More specifically, we represent videos using sentences generated via video captioning methods and classes using sentences extracted from documents acquired through search engines on the Internet. Using these representations, we build a shared semantic space employing BERT-based embedders pre-trained in the paraphrasing task on multiple text datasets. The projection of both visual and semantic information onto this space is straightforward, as they are sentences, enabling classification using the nearest neighbor rule. We demonstrate that representing videos and labels with sentences alleviates the domain adaptation problem. Additionally, we show that word vectors are unsuitable for building the semantic embedding space of our descriptions. Our method outperforms the state-of-the-art performance on the UCF101 dataset by 3.3 p.p. in accuracy under the TruZe protocol and achieves competitive results on both the UCF101 and HMDB51 datasets under the conventional protocol (0/50% - training/testing split). Our code is available at https://github.com/valterlej/zsarcap.

---------------

### 16 Jun 2023 | [M3PT: A Multi-Modal Model for POI Tagging](https://arxiv.org/abs/2306.10079) | [⬇️](https://arxiv.org/pdf/2306.10079)
*Jingsong Yang, Guanzhou Han, Deqing Yang, Jingping Liu, Yanghua Xiao, Xiang Xu, Baohua Wu, Shenghua Ni*

POI tagging aims to annotate a point of interest (POI) with some informative tags, which facilitates many services related to POIs, including search, recommendation, and so on. Most of the existing solutions neglect the significance of POI images and seldom fuse the textual and visual features of POIs, resulting in suboptimal tagging performance. In this paper, we propose a novel Multi-Modal Model for POI Tagging, namely M3PT, which achieves enhanced POI tagging through fusing the target POI's textual and visual features, and the precise matching between the multi-modal representations. Specifically, we first devise a domain-adaptive image encoder (DIE) to obtain the image embeddings aligned to their gold tags' semantics. Then, in M3PT's text-image fusion module (TIF), the textual and visual representations are fully fused into the POIs' content embeddings for the subsequent matching. In addition, we adopt a contrastive learning strategy to further bridge the gap between the representations of different modalities. To evaluate the tagging models' performance, we have constructed two high-quality POI tagging datasets from the real-world business scenario of Ali Fliggy. Upon the datasets, we conducted extensive experiments to demonstrate our model's advantage over the baselines of uni-modality and multi-modality, and to verify the effectiveness of important components in M3PT, including DIE, TIF and the contrastive learning strategy.

---------------

### 10 Feb 2020 | [The Tensor Brain: Semantic Decoding for Perception and Memory](https://arxiv.org/abs/2001.11027) | [⬇️](https://arxiv.org/pdf/2001.11027)
*Volker Tresp and Sahand Sharifzadeh and Dario Konopatzki and Yunpu Ma*

We analyse perception and memory, using mathematical models for knowledge graphs and tensors, to gain insights into the corresponding functionalities of the human mind. Our discussion is based on the concept of propositional sentences consisting of *subject-predicate-object* (SPO) triples for expressing elementary facts. SPO sentences are the basis for most natural languages but might also be important for explicit perception and declarative memories, as well as intra-brain communication and the ability to argue and reason. A set of SPO sentences can be described as a knowledge graph, which can be transformed into an adjacency tensor. We introduce tensor models, where concepts have dual representations as indices and associated embeddings, two constructs we believe are essential for the understanding of implicit and explicit perception and memory in the brain. We argue that a biological realization of perception and memory imposes constraints on information processing. In particular, we propose that explicit perception and declarative memories require a semantic decoder, which, in a simple realization, is based on four layers: First, a sensory memory layer, as a buffer for sensory input, second, an index layer representing concepts, third, a memoryless representation layer for the broadcasting of information (the "blackboard", or the "canvas" of the brain) and fourth, a working memory layer as a processing center and data buffer. We discuss the operations of the four layers and relate them to the global workspace theory. In a Bayesian brain interpretation, semantic memory defines the prior for observable triple statements. We propose that, in evolution and during development, semantic memory, episodic memory, and natural language evolved as emergent properties in agents' process to gain a deeper understanding of sensory information.

---------------
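
The knowledge-graph-as-adjacency-tensor idea can be made concrete with a toy example: one binary entry per true SPO statement; the entities and predicates are made up for illustration.

```python
# Illustrative only: turn SPO triples into a binary adjacency tensor.
import numpy as np

entities = ["anna", "ball", "park"]
predicates = ["holds", "is_in"]
triples = [("anna", "holds", "ball"), ("anna", "is_in", "park")]

e_idx = {e: i for i, e in enumerate(entities)}
p_idx = {p: i for i, p in enumerate(predicates)}

tensor = np.zeros((len(entities), len(predicates), len(entities)), dtype=np.int8)
for s, p, o in triples:
    tensor[e_idx[s], p_idx[p], e_idx[o]] = 1     # mark the statement as true
```
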

### 09 Sep 2021 | [Talk-to-Edit: Fine-Grained Facial Editing via Dialog](https://arxiv.org/abs/2109.04425) | [⬇️](https://arxiv.org/pdf/2109.04425)
*Yuming Jiang, Ziqi Huang, Xingang Pan, Chen Change Loy, Ziwei Liu*

Facial editing is an important task in vision and graphics with numerous applications. However, existing works are incapable of delivering a continuous and fine-grained editing mode (e.g., editing a slightly smiling face to a big laughing one) with natural interactions with users. In this work, we propose Talk-to-Edit, an interactive facial editing framework that performs fine-grained attribute manipulation through dialog between the user and the system. Our key insight is to model a continual "semantic field" in the GAN latent space. 1) Unlike previous works that regard the editing as traversing straight lines in the latent space, here the fine-grained editing is formulated as finding a curving trajectory that respects the fine-grained attribute landscape on the semantic field. 2) The curvature at each step is location-specific and determined by the input image as well as the users' language requests. 3) To engage the users in a meaningful dialog, our system generates language feedback by considering both the user request and the current state of the semantic field. We also contribute CelebA-Dialog, a visual-language facial editing dataset to facilitate large-scale study. Specifically, each image has manually annotated fine-grained attribute annotations as well as template-based textual descriptions in natural language. Extensive quantitative and qualitative experiments demonstrate the superiority of our framework in terms of 1) the smoothness of fine-grained editing, 2) the identity/attribute preservation, and 3) the visual photorealism and dialog fluency. Notably, a user study validates that our overall system is consistently favored by around 80% of the participants. Our project page is https://www.mmlab-ntu.com/project/talkedit/.

---------------
""")