@misc{Ulyanov2016,
  author = {Ulyanov, Dmitry and Lebedev, Vadim},
  title = {{Audio Texture Synthesis and Style Transfer}},
  urldate = {November 3, 2017},
  year = {2016}
}

@inproceedings{Wyse2017,
  abstract = {One of the decisions that arise when designing a neural network for any application is how the data should be represented in order to be presented to, and possibly generated by, a neural network. For audio, the choice is less obvious than it seems to be for visual images, and a variety of representations have been used for different applications, including the raw digitized sample stream, hand-crafted features, machine-discovered features, MFCCs and variants that include deltas, and a variety of spectral representations. This paper reviews some of these representations and the issues that arise, focusing particularly on spectrograms for generating audio using neural networks for style transfer.},
  archivePrefix = {arXiv},
  arxivId = {1706.09559},
  author = {Wyse, L.},
  booktitle = {Proceedings of the First International Workshop on Deep Learning and Music joint with IJCNN},
  eprint = {1706.09559},
  keywords = {data representation, sound synthesis, spectrograms, style transfer},
  number = {1},
  pages = {37--41},
  title = {{Audio Spectrogram Representations for Processing with Convolutional Neural Networks}},
  url = {http://arxiv.org/abs/1706.09559},
  volume = {1},
  year = {2017}
}

@article{Ustyuzhaninov2016,
  abstract = {Here we demonstrate that the feature space of random shallow convolutional neural networks (CNNs) can serve as a surprisingly good model of natural textures. Patches from the same texture are consistently classified as being more similar than patches from different textures. Samples synthesized from the model capture spatial correlations on scales much larger than the receptive field size, and sometimes even rival or surpass the perceptual quality of state-of-the-art texture models (but show less variability). The current state of the art in parametric texture synthesis relies on the multi-layer feature space of deep CNNs that were trained on natural images. Our finding suggests that such optimized multi-layer feature spaces are not imperative for texture modeling. Instead, much simpler shallow convolutional networks can serve as the basis for novel texture synthesis algorithms.},
  archivePrefix = {arXiv},
  arxivId = {1606.00021},
  author = {Ustyuzhaninov, Ivan and Brendel, Wieland and Gatys, Leon A. and Bethge, Matthias},
  eprint = {1606.00021},
  journal = {arXiv preprint arXiv:1606.00021},
  pages = {1--9},
  title = {{Texture Synthesis Using Shallow Convolutional Networks with Random Filters}},
  url = {http://arxiv.org/abs/1606.00021},
  year = {2016}
}

@article{Gatys,
  archivePrefix = {arXiv},
  arxivId = {1508.06576},
  author = {Gatys, Leon A. and Ecker, Alexander S. and Bethge, Matthias},
  eprint = {1508.06576},
  journal = {arXiv preprint arXiv:1508.06576},
  title = {{A Neural Algorithm of Artistic Style}},
  year = {2015}
}

@article{Prusa2017,
  author = {Prů{\v{s}}a, Zden{\v{e}}k and Rajmic, Pavel},
  doi = {10.1109/LSP.2017.2696970},
  issn = {1070-9908},
  journal = {IEEE Signal Processing Letters},
  keywords = {Phase reconstruction, real-time, short-time Fourier transform (STFT), spectrogram, time-frequency},
  number = {6},
  pages = {892--896},
  title = {{Toward High-Quality Real-Time Signal Reconstruction from STFT Magnitude}},
  volume = {24},
  year = {2017}
}

@article{Griffin1984,
  author = {Griffin, Daniel W. and Lim, Jae S.},
  journal = {IEEE Transactions on Acoustics, Speech, and Signal Processing},
  number = {2},
  pages = {236--243},
  title = {{Signal Estimation from Modified Short-Time Fourier Transform}},
  volume = {32},
  year = {1984}
}

@inproceedings{Engel2017,
  abstract = {Generative models in vision have seen rapid progress due to algorithmic improvements and the availability of high-quality image datasets. In this paper, we offer contributions in both these areas to enable similar progress in audio modeling. First, we detail a powerful new WaveNet-style autoencoder model that conditions an autoregressive decoder on temporal codes learned from the raw audio waveform. Second, we introduce NSynth, a large-scale and high-quality dataset of musical notes that is an order of magnitude larger than comparable public datasets. Using NSynth, we demonstrate improved qualitative and quantitative performance of the WaveNet autoencoder over a well-tuned spectral autoencoder baseline. Finally, we show that the model learns a manifold of embeddings that allows for morphing between instruments, meaningfully interpolating in timbre to create new types of sounds that are realistic and expressive.},
  archivePrefix = {arXiv},
  arxivId = {1704.01279},
  author = {Engel, Jesse and Resnick, Cinjon and Roberts, Adam and Dieleman, Sander and Eck, Douglas and Simonyan, Karen and Norouzi, Mohammad},
  booktitle = {Proceedings of the 34th International Conference on Machine Learning},
  eprint = {1704.01279},
  title = {{Neural Audio Synthesis of Musical Notes with WaveNet Autoencoders}},
  url = {http://arxiv.org/abs/1704.01279},
  year = {2017}
}

@article{Oord2016b,
  abstract = {This paper introduces WaveNet, a deep neural network for generating raw audio waveforms. The model is fully probabilistic and autoregressive, with the predictive distribution for each audio sample conditioned on all previous ones; nonetheless we show that it can be efficiently trained on data with tens of thousands of samples per second of audio. When applied to text-to-speech, it yields state-of-the-art performance, with human listeners rating it as significantly more natural sounding than the best parametric and concatenative systems for both English and Mandarin. A single WaveNet can capture the characteristics of many different speakers with equal fidelity, and can switch between them by conditioning on the speaker identity. When trained to model music, we find that it generates novel and often highly realistic musical fragments. We also show that it can be employed as a discriminative model, returning promising results for phoneme recognition.},
  archivePrefix = {arXiv},
  arxivId = {1609.03499},
  author = {van den Oord, Aaron and Dieleman, Sander and Zen, Heiga and Simonyan, Karen and Vinyals, Oriol and Graves, Alex and Kalchbrenner, Nal and Senior, Andrew and Kavukcuoglu, Koray},
  eprint = {1609.03499},
  journal = {arXiv preprint arXiv:1609.03499},
  pages = {1--15},
  title = {{WaveNet: A Generative Model for Raw Audio}},
  url = {http://arxiv.org/abs/1609.03499},
  year = {2016}
}

@inproceedings{Hershey2016,
  abstract = {Convolutional Neural Networks (CNNs) have proven very effective in image classification and show promise for audio. We use various CNN architectures to classify the soundtracks of a dataset of 70M training videos (5.24 million hours) with 30,871 video-level labels. We examine fully connected Deep Neural Networks (DNNs), AlexNet [1], VGG [2], Inception [3], and ResNet [4]. We investigate varying the size of both training set and label vocabulary, finding that analogs of the CNNs used in image classification do well on our audio classification task, and larger training and label sets help up to a point. A model using embeddings from these classifiers does much better than raw features on the Audio Set [5] Acoustic Event Detection (AED) classification task.},
  archivePrefix = {arXiv},
  arxivId = {1609.09430},
  author = {Hershey, Shawn and Chaudhuri, Sourish and Ellis, Daniel P. W. and Gemmeke, Jort F. and Jansen, Aren and Moore, R. Channing and Plakal, Manoj and Platt, Devin and Saurous, Rif A. and Seybold, Bryan and Slaney, Malcolm and Weiss, Ron J. and Wilson, Kevin},
  booktitle = {International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  eprint = {1609.09430},
  isbn = {9781509041176},
  pages = {4--8},
  title = {{CNN Architectures for Large-Scale Audio Classification}},
  url = {http://arxiv.org/abs/1609.09430},
  year = {2016}
}

@article{Ulyanov2016b,
  abstract = {Gatys et al. recently demonstrated that deep networks can generate beautiful textures and stylized images from a single texture example. However, their method requires a slow and memory-consuming optimization process. We propose here an alternative approach that moves the computational burden to a learning stage. Given a single example of a texture, our approach trains compact feed-forward convolutional networks to generate multiple samples of the same texture of arbitrary size and to transfer artistic style from a given image to any other image. The resulting networks are remarkably lightweight and can generate textures of quality comparable to Gatys et al., but hundreds of times faster. More generally, our approach highlights the power and flexibility of generative feed-forward models trained with complex and expressive loss functions.},
  archivePrefix = {arXiv},
  arxivId = {1603.03417},
  author = {Ulyanov, Dmitry and Lebedev, Vadim and Vedaldi, Andrea and Lempitsky, Victor},
  eprint = {1603.03417},
  isbn = {9781510829008},
  issn = {1938-7228},
  title = {{Texture Networks: Feed-forward Synthesis of Textures and Stylized Images}},
  url = {http://arxiv.org/abs/1603.03417},
  year = {2016}
}