2020
31. | Paier, Wolfgang; Hilsmann, Anna; Eisert, Peter: Interactive Facial Animation with Deep Neural Networks. Journal Article (Forthcoming), IET Computer Vision, 2020.
Creating realistic animations of human faces is still a challenging task in computer graphics. While CG models capture much variability in a small parameter vector, they usually do not meet the necessary visual quality. This is because geometry-based animation often does not allow fine-grained deformations and fails to produce realistic renderings in difficult areas (mouth, eyes). Image-based animation techniques avoid these problems by using dynamic textures that capture details and small movements that are not explained by geometry. This comes at the cost of high memory requirements and limited flexibility in terms of animation, because dynamic texture sequences need to be concatenated seamlessly, which is not always possible and is prone to visual artefacts. In this paper, we present a new hybrid animation framework that exploits recent advances in deep learning to provide an interactive animation engine that can be used via a simple and intuitive visualization for facial expression editing. We describe an automatic pipeline to generate training sequences that consist of dynamic textures plus sequences of consistent 3D face models. Based on this data, we train a variational auto-encoder to learn a low-dimensional latent space of facial expressions that is used for interactive facial animation.
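The latent-expression idea described in this abstract can be illustrated with a minimal variational auto-encoder sketch. This is not the authors' implementation: the PyTorch framework, the flattened 512-dimensional expression vector, the 16-dimensional latent space and the loss weighting are all assumptions made purely for illustration.

```python
import torch
import torch.nn as nn

class ExpressionVAE(nn.Module):
    """Toy VAE mapping an expression vector to a small latent code and back.
    The 512-dim input and 16-dim latent space are hypothetical choices."""
    def __init__(self, in_dim=512, latent_dim=16):
        super().__init__()
        self.encoder = nn.Sequential(nn.Linear(in_dim, 128), nn.ReLU())
        self.to_mu = nn.Linear(128, latent_dim)
        self.to_logvar = nn.Linear(128, latent_dim)
        self.decoder = nn.Sequential(nn.Linear(latent_dim, 128), nn.ReLU(),
                                     nn.Linear(128, in_dim))

    def forward(self, x):
        h = self.encoder(x)
        mu, logvar = self.to_mu(h), self.to_logvar(h)
        z = mu + torch.randn_like(mu) * torch.exp(0.5 * logvar)  # reparameterisation
        return self.decoder(z), mu, logvar

def vae_loss(x, x_hat, mu, logvar, beta=1.0):
    recon = nn.functional.mse_loss(x_hat, x, reduction='mean')
    kld = -0.5 * torch.mean(1 + logvar - mu.pow(2) - logvar.exp())
    return recon + beta * kld

# Interactive editing then amounts to moving a point z in the latent space
# and decoding it back to an expression.
model = ExpressionVAE()
x = torch.randn(4, 512)            # placeholder expression vectors
x_hat, mu, logvar = model(x)
loss = vae_loss(x, x_hat, mu, logvar)
```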
30. | Upadrasta, Vaishnavi; Oehme, Astrid: Media Technology Development: A Team Study with User Focus. Inproceedings (Forthcoming), Languages & the Media 2020, the 13th International Conference on Language Transfer in Audiovisual Media.
29. | Oehme, Astrid; Upadrasta, Vaishnavi; Kotsch, Philipp: Development of a Multilingual Questionnaire for the Deaf Community – Guidelines and Challenges. Inproceedings, in: Stephanidis, Constantine; Antona, Margherita; Gao, Qin; Zhou, Jia (Eds.): HCI International 2020 – Late Breaking Papers: Universal Access and Inclusive Design, pp. 103–113, Springer International Publishing, Cham, 2020, ISBN: 978-3-030-60149-2, DOI: 10.1007/978-3-030-60149-2_9.
To understand the user requirements and needs of deaf TV consumers in the project CONTENT4ALL, and in order to set specifications and standardizations for an acceptable TV layout design, an online survey was conducted during the project's user research. The paper describes the development of a four-stage online questionnaire accessible to deaf participants, which includes demographic data, assessments of single layout questions as well as a layout do-it-yourself puzzle. The insights gathered during the survey's development have been solidified into a draft guideline for similar user-research approaches, which will be briefly described. The survey was implemented using the vue.js framework in conjunction with its associated state management library vuex, and was realized in five European sign languages and spoken languages in accordance with the consortium composition. Participants were asked to rate 36 possible layout combinations based on their preferences for the position and size of the different objects and subjects that are depicted, and were then given the opportunity to create their own layout to their liking. The paper concludes with a report of first findings and highlights the challenges faced during the preparation process.
28. | Upadrasta, Vaishnavi; Oehme, Astrid; Böhm, Sandra: User-Centered Design for Accessibility in Media Content – Sign Language and Virtual Signer. Inproceedings, in: Stephanidis, Constantine; Antona, Margherita; Gao, Qin; Zhou, Jia (Eds.): HCI International 2020 – Late Breaking Papers: Universal Access and Inclusive Design, pp. 126–143, Springer International Publishing, Cham, 2020, ISBN: 978-3-030-60149-2, DOI: 10.1007/978-3-030-60149-2_11.
Even though User-Centred Design (UCD) is widely accepted and employed in the design and development of interactive systems, there are limited guiding procedures on how such approaches can be translated to the development of products, systems, and services that focus primarily on accessibility. This paper reports on the application of UCD for such a system within the EU project CONTENT4ALL, in the domain of accessibility for the Deaf. Each step in the UCD process and the respective activities within each step are described in detail, with a focus on the methods and techniques adopted to assist in the development of the novel technology. The insights gained during the entirety of the user-centred design and evaluation process have led to a compilation of important factors for creating sign-translated media content. This possible guideline comprises a list of useful and necessary components pertaining to sign language delivery in media, particularly sign television. The objective of this paper is to highlight lessons learned, presented in the form of recommendations for human factors researchers on key UCD procedures for the development of accessibility products, systems and services, based on the user activities performed within the project. An attempt has been made to reduce the gap in the literature and add to a possible UCD guiding process exclusively for accessibility.
27. | Camgöz, Necati Cihan; Koller, Oscar; Hadfield, Simon; Bowden, Richard Multi-channel Transformers for Multi-articulatory Sign Language Translation Inproceedings 16th European Conference on Computer Vision (ECCV), ACVR Workshop, 2020, Springer International Publishing, 2020, (Series Volume: 12356). @inproceedings{surrey858587, title = {Multi-channel Transformers for Multi-articulatory Sign Language Translation}, author = {Necati Cihan Camgöz and Oscar Koller and Simon Hadfield and Richard Bowden}, url = {http://epubs.surrey.ac.uk/858587/}, doi = {10.1007/978-3-030-58621-8}, year = {2020}, date = {2020-08-23}, booktitle = {16th European Conference on Computer Vision (ECCV), ACVR Workshop, 2020}, journal = {Proceedings of the 16th European Conference on Computer Vision (ECCV 2020) Part XI}, publisher = {Springer International Publishing}, abstract = {Sign languages use multiple asynchronous information channels (articulators), not just the hands but also the face and body, which computational approaches often ignore. In this paper we tackle the ultiarticulatory sign language translation task and propose a novel multichannel transformer architecture. The proposed architecture allows both the inter and intra contextual relationships between different sign articulators to be modelled within the transformer network itself, while also maintaining channel specific information. We evaluate our approach on the RWTH-PHOENIX-Weather-2014T dataset and report competitive translation performance. Importantly, we overcome the reliance on gloss annotations which underpin other state-of-the-art approaches, thereby removing the need for expensive curated datasets.}, note = {Series Volume: 12356}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } Sign languages use multiple asynchronous information channels (articulators), not just the hands but also the face and body, which computational approaches often ignore. In this paper we tackle the ultiarticulatory sign language translation task and propose a novel multichannel transformer architecture. The proposed architecture allows both the inter and intra contextual relationships between different sign articulators to be modelled within the transformer network itself, while also maintaining channel specific information. We evaluate our approach on the RWTH-PHOENIX-Weather-2014T dataset and report competitive translation performance. Importantly, we overcome the reliance on gloss annotations which underpin other state-of-the-art approaches, thereby removing the need for expensive curated datasets. |
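A very rough sketch of the multi-channel idea, not the paper's architecture: one transformer encoder per articulator stream plus a fusion stage over the concatenated outputs. The feature dimensions, number of layers and the simple concatenation-based fusion are assumptions; the actual model performs cross-channel attention inside the transformer itself.

```python
import torch
import torch.nn as nn

class MultiChannelEncoder(nn.Module):
    """Simplified stand-in: each articulator stream (hands, face, body) gets its
    own transformer encoder, and one extra encoder over the concatenated
    sequences mixes information across channels. Sizes are hypothetical."""
    def __init__(self, feat_dims=(128, 64, 64), d_model=256, n_layers=2):
        super().__init__()
        self.proj = nn.ModuleList([nn.Linear(d, d_model) for d in feat_dims])
        layer = nn.TransformerEncoderLayer(d_model, nhead=4, batch_first=True)
        self.channel_encoders = nn.ModuleList(
            [nn.TransformerEncoder(layer, num_layers=n_layers) for _ in feat_dims])
        self.fusion = nn.TransformerEncoder(layer, num_layers=1)

    def forward(self, channels):
        encoded = [enc(p(x)) for enc, p, x in
                   zip(self.channel_encoders, self.proj, channels)]
        return self.fusion(torch.cat(encoded, dim=1))   # (B, sum_T, d_model)

# toy usage: batch of 2 sequences, 10 frames per channel
hands, face, body = (torch.randn(2, 10, d) for d in (128, 64, 64))
memory = MultiChannelEncoder()((hands, face, body))
```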
26. | Saunders, Ben; Camgöz, Necati Cihan; Bowden, Richard: Adversarial Training for Multi-Channel Sign Language Production. Inproceedings, The 31st British Machine Vision Virtual Conference (BMVC), British Machine Vision Association, 2020, URL: http://epubs.surrey.ac.uk/858417/.
Sign Languages are rich multi-channel languages, requiring articulation of both manual (hands) and non-manual (face and body) features in a precise, intricate manner. Sign Language Production (SLP), the automatic translation from spoken to sign languages, must embody this full sign morphology to be truly understandable by the Deaf community. Previous work has mainly focused on manual feature production, with an under-articulated output caused by regression to the mean. In this paper, we propose an Adversarial Multi-Channel approach to SLP. We frame sign production as a minimax game between a transformer-based Generator and a conditional Discriminator. Our adversarial discriminator evaluates the realism of sign production conditioned on the source text, pushing the generator towards a realistic and articulate output. Additionally, we fully encapsulate sign articulators with the inclusion of non-manual features, producing facial features and mouthing patterns. We evaluate on the challenging RWTH-PHOENIX-Weather-2014T (PHOENIX14T) dataset, and report state-of-the-art SLP back-translation performance for manual production. We set new benchmarks for the production of multi-channel sign to underpin future research into realistic SLP.
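The conditional adversarial objective described above can be sketched as follows. This is a hedged illustration only: the recurrent discriminator, the pose/text dimensions and the BCE formulation are placeholder assumptions, not the paper's transformer-based setup.

```python
import torch
import torch.nn as nn

class CondDiscriminator(nn.Module):
    """Toy conditional discriminator: scores a produced pose sequence as
    real/fake given an embedding of the source text. All sizes hypothetical."""
    def __init__(self, pose_dim=150, text_dim=256, hidden=256):
        super().__init__()
        self.rnn = nn.GRU(pose_dim + text_dim, hidden, batch_first=True)
        self.head = nn.Linear(hidden, 1)

    def forward(self, poses, text_emb):
        # broadcast the sentence embedding over every time step
        cond = text_emb.unsqueeze(1).expand(-1, poses.size(1), -1)
        _, h = self.rnn(torch.cat([poses, cond], dim=-1))
        return self.head(h[-1])                     # raw realism score

bce = nn.BCEWithLogitsLoss()

def discriminator_loss(D, real, fake, text_emb):
    return (bce(D(real, text_emb), torch.ones(real.size(0), 1)) +
            bce(D(fake.detach(), text_emb), torch.zeros(fake.size(0), 1)))

def generator_adv_loss(D, fake, text_emb):
    # the generator is rewarded when the discriminator believes its output is real
    return bce(D(fake, text_emb), torch.ones(fake.size(0), 1))

# toy usage
D = CondDiscriminator()
real, fake = torch.randn(4, 20, 150), torch.randn(4, 20, 150)
text_emb = torch.randn(4, 256)
d_loss = discriminator_loss(D, real, fake, text_emb)
g_loss = generator_adv_loss(D, fake, text_emb)
```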
25. | Saunders, Ben; Camgöz, Necati Cihan; Bowden, Richard: Progressive Transformers for End-to-End Sign Language Production. Inproceedings (Forthcoming), European Conference on Computer Vision (ECCV), 2020, URL: http://epubs.surrey.ac.uk/858238/.
The goal of automatic Sign Language Production (SLP) is to translate spoken language to a continuous stream of sign language video at a level comparable to a human translator. If this were achievable, it would revolutionise Deaf-hearing communication. Previous work on predominantly isolated SLP has shown the need for architectures that are better suited to the continuous domain of full sign sequences. In this paper, we propose Progressive Transformers, the first SLP model to translate from discrete spoken language sentences to continuous 3D sign pose sequences in an end-to-end manner. A novel counter decoding technique is introduced that enables continuous sequence generation at training and inference. We present two model configurations: an end-to-end network that produces sign directly from text and a stacked network that utilises a gloss intermediary. We also provide several data augmentation processes to overcome the problem of drift and drastically improve the performance of SLP models. We propose a back-translation evaluation mechanism for SLP, presenting benchmark quantitative results on the challenging RWTH-PHOENIX-Weather-2014T (PHOENIX14T) dataset and setting baselines for future research. Code is available at https://github.com/BenSaunders27/ProgressiveTransformersSLP.
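Counter decoding, as described in the abstract, can be illustrated with a toy autoregressive generator that predicts a progress counter in [0, 1] alongside each pose and stops when the counter saturates. The recurrent cell, pose dimension and stopping threshold below are all assumptions; the actual model is a transformer, and this sketch shows only the counter idea.

```python
import torch
import torch.nn as nn

class PoseDecoderStep(nn.Module):
    """Placeholder one-step decoder: given the previous pose vector it predicts
    the next pose plus a progress counter in [0, 1]. Sizes are hypothetical."""
    def __init__(self, pose_dim=150, hidden=256):
        super().__init__()
        self.rnn_cell = nn.GRUCell(pose_dim, hidden)
        self.pose_head = nn.Linear(hidden, pose_dim)
        self.counter_head = nn.Linear(hidden, 1)

    def forward(self, prev_pose, h):
        h = self.rnn_cell(prev_pose, h)
        return self.pose_head(h), torch.sigmoid(self.counter_head(h)), h

def generate(decoder, pose_dim=150, hidden=256, max_len=300):
    """Counter-driven inference: keep producing frames until the predicted
    counter reaches ~1.0 instead of waiting for an end-of-sequence token."""
    pose = torch.zeros(1, pose_dim)
    h = torch.zeros(1, hidden)
    frames = []
    for _ in range(max_len):
        pose, counter, h = decoder(pose, h)
        frames.append(pose)
        if counter.item() >= 1.0 - 1e-3:   # untrained model: falls back to max_len
            break
    return torch.stack(frames, dim=1)       # (1, T, pose_dim)

frames = generate(PoseDecoderStep())
```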
24. | Jayasingam, Adhuran; Kulupana, Gosala; Galkandage, Chathura; Fernando, Anil: Multiple Quantization Parameter Optimization in Versatile Video Coding for 360° Videos. Journal Article, IEEE Transactions on Consumer Electronics, pp. 1-1, 2020, ISSN: 1558-4127, DOI: 10.1109/TCE.2020.3001231.
Consumer Electronics (CE) devices have started to support 360° videos in order to improve the video quality perceived by consumers. Higher compression of 360° videos not only leads to improved consumer perception, but also to efficient energy and storage usage in CE devices. Consequently, efficient compression of such video content is becoming a critical design consideration for CE devices. To this end, this paper first proposes a residual weighting algorithm that utilizes the features of Weighted Craster Parabolic Projection Peak Signal-to-Noise Ratio (WCPPPSNR) to instigate the first stage of magnitude reduction of the residuals. The model parameters associated with the residual weighting algorithm are determined in the initial experimental stage. Next, the paper proposes a Quantization Parameter (QP) optimization technique which introduces the second stage of magnitude reduction for the residuals with minimal impact on the overall video quality. The proposed method is tested on the Versatile Video Coding (VVC) test model with the All-Intra configuration. In this context, the empirical results demonstrate that the proposed method improves the coding efficiency by 3.34% on average and outperforms state-of-the-art techniques.
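The residual weighting stage rests on the observation that spherical projections oversample some image regions, so their residuals can be down-weighted. A minimal NumPy sketch of that idea is shown below, using a simple cosine-of-latitude weighting for an equirectangular frame; the actual weights in the paper are derived from WCPPPSNR, so the weighting function here is only an assumption for illustration.

```python
import numpy as np

def latitude_weights(height, width):
    """Per-pixel weights for an equirectangular frame: rows near the poles are
    heavily oversampled on the sphere, so they receive small weights
    (cosine-of-latitude weighting, as used e.g. in WS-PSNR)."""
    rows = np.arange(height)
    lat = (rows + 0.5) / height * np.pi - np.pi / 2       # latitude per row
    w = np.cos(lat)
    return np.repeat(w[:, None], width, axis=1)

def weight_residuals(residual_block, weights):
    """First-stage magnitude reduction: scale the prediction residuals so that
    sphere-redundant regions contribute less to the coded signal."""
    return residual_block * weights

# toy usage on a random "residual frame"
H, W = 120, 240
residuals = np.random.randn(H, W)
weighted = weight_residuals(residuals, latitude_weights(H, W))
```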
23. | Vowels, Matthew; Camgöz, Necati Cihan; Bowden, Richard: Nested VAE: Isolating Common Factors via Weak Supervision. Inproceedings (Forthcoming), 15th IEEE International Conference on Automatic Face and Gesture Recognition, 2020, URL: http://epubs.surrey.ac.uk/854112/.
Fair and unbiased machine learning is an important and active field of research, as decision processes are increasingly driven by models that learn from data. Unfortunately, any biases present in the data may be learned by the model, thereby inappropriately transferring that bias into the decision-making process. We identify the connection between the task of bias reduction and that of isolating factors common between domains whilst encouraging domain-specific invariance. To isolate the common factors we combine the theory of deep latent variable models with information bottleneck theory for scenarios whereby data may be naturally paired across domains and no additional supervision is required. The result is the Nested Variational AutoEncoder (NestedVAE). Two outer VAEs with shared weights attempt to reconstruct the input and infer a latent space, whilst a nested VAE attempts to reconstruct the latent representation of one image from the latent representation of its paired image. In so doing, the nested VAE isolates the common latent factors/causes and becomes invariant to unwanted factors that are not shared between paired images. We also propose a new metric to provide a balanced method of evaluating consistency and classifier performance across domains, which we refer to as the Adjusted Parity metric. An evaluation of NestedVAE on both domain and attribute invariance, change detection, and learning common factors for the prediction of biological sex demonstrates that NestedVAE significantly outperforms alternative methods.
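The nesting structure described in the abstract can be sketched as a weight-shared outer VAE applied to both images of a pair plus an inner VAE that predicts one latent code from the other. The network sizes, the omission of the KL terms and the simple sum-of-MSE objective are all simplifying assumptions; this is not the authors' NestedVAE implementation.

```python
import torch
import torch.nn as nn

class TinyVAE(nn.Module):
    """Minimal VAE used twice with shared weights as the 'outer' model."""
    def __init__(self, in_dim=784, z_dim=32):
        super().__init__()
        self.enc = nn.Sequential(nn.Linear(in_dim, 256), nn.ReLU())
        self.mu, self.logvar = nn.Linear(256, z_dim), nn.Linear(256, z_dim)
        self.dec = nn.Sequential(nn.Linear(z_dim, 256), nn.ReLU(),
                                 nn.Linear(256, in_dim))

    def forward(self, x):
        h = self.enc(x)
        mu, logvar = self.mu(h), self.logvar(h)
        z = mu + torch.randn_like(mu) * torch.exp(0.5 * logvar)
        return self.dec(z), z, mu, logvar

class NestedVAESketch(nn.Module):
    """One outer VAE (weight-shared across the paired inputs) plus an inner VAE
    that predicts the latent code of image B from the latent code of its paired
    image A, so only factors common to the pair survive in the inner latents."""
    def __init__(self, in_dim=784, z_dim=32, inner_dim=8):
        super().__init__()
        self.outer = TinyVAE(in_dim, z_dim)          # shared weights for A and B
        self.inner = TinyVAE(z_dim, inner_dim)       # operates on latents

    def forward(self, x_a, x_b):
        rec_a, z_a, *_ = self.outer(x_a)
        rec_b, z_b, *_ = self.outer(x_b)
        z_b_hat, *_ = self.inner(z_a)                # reconstruct B's latent from A's
        # KL terms omitted for brevity in this sketch
        return (nn.functional.mse_loss(rec_a, x_a) +
                nn.functional.mse_loss(rec_b, x_b) +
                nn.functional.mse_loss(z_b_hat, z_b.detach()))

loss = NestedVAESketch()(torch.randn(4, 784), torch.randn(4, 784))
```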
22. | Camgöz, Necati Cihan; Koller, Oscar; Hadfield, Simon; Bowden, Richard: Sign Language Transformers: Joint End-to-end Sign Language Recognition and Translation. Inproceedings (Forthcoming), IEEE Conference on Computer Vision and Pattern Recognition (CVPR), 2020, URL: http://epubs.surrey.ac.uk/854110/.
Prior work on Sign Language Translation has shown that having a mid-level sign gloss representation (effectively recognizing the individual signs) improves the translation performance drastically. In fact, the current state-of-the-art in translation requires gloss-level tokenization in order to work. We introduce a novel transformer-based architecture that jointly learns Continuous Sign Language Recognition and Translation while being trainable in an end-to-end manner. This is achieved by using a Connectionist Temporal Classification (CTC) loss to bind the recognition and translation problems into a single unified architecture. This joint approach does not require any ground-truth timing information, simultaneously solving two co-dependent sequence-to-sequence learning problems, and leads to significant performance gains. We evaluate the recognition and translation performances of our approaches on the challenging RWTH-PHOENIX-Weather-2014T (PHOENIX14T) dataset. We report state-of-the-art sign language recognition and translation results achieved by our Sign Language Transformers. Our translation networks outperform both sign-video-to-spoken-language and gloss-to-spoken-language translation models, in some cases more than doubling the performance (9.58 vs. 21.80 BLEU-4 score). We also share new baseline translation results using transformer networks for several other text-to-text sign language translation tasks.
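The joint objective that binds recognition and translation can be illustrated as a CTC loss on a gloss-recognition head added to a cross-entropy loss on the spoken-language decoder. The vocabulary sizes, tensor shapes and weighting factor below are hypothetical; this sketch only shows the loss combination, not the paper's full transformer model.

```python
import torch
import torch.nn as nn

ctc = nn.CTCLoss(blank=0, zero_infinity=True)
xent = nn.CrossEntropyLoss(ignore_index=0)

def joint_loss(gloss_logits, gloss_targets, frame_lens, gloss_lens,
               word_logits, word_targets, lam=1.0):
    # gloss_logits: (T, B, gloss_vocab) -> log-probabilities for CTC
    recognition = ctc(gloss_logits.log_softmax(-1), gloss_targets,
                      frame_lens, gloss_lens)
    # word_logits: (B, L, word_vocab), word_targets: (B, L)
    translation = xent(word_logits.reshape(-1, word_logits.size(-1)),
                       word_targets.reshape(-1))
    return recognition + lam * translation

# toy shapes: 2 videos of 50 frames, gloss vocab 100, word vocab 500
gloss_logits = torch.randn(50, 2, 100)
word_logits = torch.randn(2, 12, 500)
loss = joint_loss(gloss_logits, torch.randint(1, 100, (2, 8)),
                  torch.full((2,), 50, dtype=torch.long),
                  torch.full((2,), 8, dtype=torch.long),
                  word_logits, torch.randint(1, 500, (2, 12)))
```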
21. | Stoll, Stephanie; Camgöz, Necati Cihan; Hadfield, Simon; Bowden, Richard: Text2Sign: Towards Sign Language Production Using Neural Machine Translation and Generative Adversarial Networks. Journal Article, International Journal of Computer Vision, 128, pp. 891-908, Springer, 2020, DOI: 10.1007/s11263-019-01281-2, URL: http://epubs.surrey.ac.uk/853393/.
We present a novel approach to automatic Sign Language Production using recent developments in Neural Machine Translation (NMT), Generative Adversarial Networks, and motion generation. Our system is capable of producing sign videos from spoken language sentences. Contrary to current approaches that are dependent on heavily annotated data, our approach requires minimal gloss- and skeletal-level annotations for training. We achieve this by breaking down the task into dedicated sub-processes. We first translate spoken language sentences into sign pose sequences by combining an NMT network with a Motion Graph. The resulting pose information is then used to condition a generative model that produces photo-realistic sign language video sequences. This is the first approach to continuous sign video generation that does not use a classical graphical avatar. We evaluate the translation abilities of our approach on the PHOENIX14T Sign Language Translation dataset. We set a baseline for text-to-gloss translation, reporting a BLEU-4 score of 16.34/15.26 on the dev/test sets. We further demonstrate the video generation capabilities of our approach for both multi-signer and high-definition settings, qualitatively and quantitatively, using broadcast quality assessment metrics.
20. | Hilsmann, Anna; Fechteler, Philipp; Morgenstern, Wieland; Paier, Wolfgang; Feldmann, Ingo; Schreer, Oliver; Eisert, Peter: Going beyond Free Viewpoint: Creating Animatable Volumetric Video of Human Performances. Journal Article, IET Computer Vision, 2020, DOI: 10.1049/iet-cvi.2019.0786.
In this paper, we present an end-to-end pipeline for the creation of high-quality animatable volumetric video content of human performances. Going beyond the application of free-viewpoint volumetric video, we allow re-animation and alteration of an actor's performance through (i) the enrichment of the captured data with semantics and animation properties and (ii) applying hybrid geometry- and video-based animation methods that allow a direct animation of the high-quality data itself instead of creating an animatable model that resembles the captured data. Semantic enrichment and geometric animation ability are achieved by establishing temporal consistency in the 3D data, followed by an automatic rigging of each frame using a parametric shape-adaptive full human body model. Our hybrid geometry- and video-based animation approaches combine the flexibility of classical CG animation with the realism of real captured data. For pose editing, we exploit the captured data as much as possible and kinematically deform the captured frames to fit a desired pose. Further, we treat the face differently from the body in a hybrid geometry- and video-based animation approach where coarse movements and poses are modeled in the geometry only, while very fine and subtle details in the face, often lacking in purely geometric methods, are captured in video-based textures. These are processed to be interactively combined to form new facial expressions. On top of that, we learn the appearance of regions that are challenging to synthesize, such as the teeth or the eyes, and fill in missing regions realistically in an autoencoder-based approach. This paper covers the full pipeline, from capturing and producing high-quality video content, over the enrichment with semantics and deformation properties for re-animation, to the processing of the data for the final hybrid animation.
19. | Erabadda, Buddhiprabha; Mallikarachchi, Thanuja; Kulupana, Gosala; Fernando, Anil: Virtual Frames as Long-Term Reference Frames for HEVC Inter-Prediction. Inproceedings, 2020 IEEE International Conference on Consumer Electronics (ICCE), pp. 1-2, 2020, ISSN: 2158-4001, DOI: 10.1109/ICCE46568.2020.9043054.
High Efficiency Video Coding (HEVC) employs both past and future frames when encoding the current frame in a video sequence. This paper proposes a framework for using virtual reference frames to achieve increased long-term coding gains for repetitive scenes in static camera scenarios.
18. | Jayasingam, Adhuran; Kulupana, Gosala; Galkandage, Chathura; Fernando, Anil: Optimal Distortion Minimization for 360° Video Compression with VVC. Inproceedings, 2020 IEEE International Conference on Consumer Electronics (ICCE), pp. 1-3, 2020, ISSN: 2158-4001, DOI: 10.1109/ICCE46568.2020.9043034.
It is vital that video encoders understand and remove the redundant information present in spherically projected 360° videos. To this end, this paper formulates a spherically adaptive objective function that incorporates novel adaptive quantization and weighted residual techniques to reduce the magnitude of the redundant information while minimizing the distortion. Furthermore, this paper identifies the residual weighting function and the optimal quantization parameters that are used to encode 360° videos. The proposed method is tested on the Versatile Video Coding (VVC) test model with the All-Intra configuration. The obtained results exhibit an average bit rate saving of 3.18%.
17. | Jayasingam, Adhuran; Galkandage, Chathura; Kulupana, Gosala; Fernando, Anil: Efficient VVC Intra Coding for 360° Video with Residual Weighting and Adaptive Quantization. Inproceedings, 2020 IEEE International Conference on Consumer Electronics (ICCE), pp. 1-5, 2020, ISSN: 2158-4001, DOI: 10.1109/ICCE46568.2020.9043002.
It is vital that video encoders remove redundant information present in spherically projected 360° video frames. To this end, this paper proposes a novel weighted residual technique and an integrated technique that combines the proposed weighted residual technique with a state-of-the-art adaptive quantization technique. The proposed methods adapt to the spherical characteristics in order to reduce the magnitude of the redundant information. Tests of the weighted residual technique and the integrated technique in the All-Intra configuration using the Versatile Video Coding (VVC) test model produce average bit rate savings between 0.91% and 1.35% with regard to various spherical objective quality metrics.
2019
16. | Erabadda, Buddhiprabha; Mallikarachchi, Thanuja; Kulupana, Gosala; Fernando, Anil: Fast CU Size Decisions for HEVC Inter-Prediction Using Support Vector Machines. Inproceedings, 2019 27th European Signal Processing Conference (EUSIPCO), pp. 1-5, 2019, ISSN: 2076-1465, DOI: 10.23919/EUSIPCO.2019.8903081.
The brute-force rate-distortion-optimisation-based approach used in High Efficiency Video Coding (HEVC) encoders to determine the best block partitioning structure for a given content demands an excessive amount of computational resources. In this context, this paper proposes a novel algorithm to reduce the computational complexity of HEVC inter-prediction using Support Vector Machines. The proposed algorithm predicts the Coding Unit (CU) split decision of a particular block, enabling the encoder to directly encode the selected block and avoid the unnecessary evaluation of the remaining CU size combinations. Experimental results demonstrate encoding time reductions of ~58% and ~50%, with 2.27% and 1.89% Bjøntegaard Delta Bit Rate (BDBR) losses, for Random Access and Low-Delay B configurations, respectively.
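The core idea — replacing the exhaustive split/no-split evaluation with a learned classifier — can be illustrated with a small scikit-learn sketch. The features, labels and decision rule below are entirely hypothetical stand-ins; the paper's actual feature set and training procedure differ.

```python
import numpy as np
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Hypothetical per-CU features (e.g. residual variance, motion-vector spread,
# neighbouring-CU depths); labels: 1 = split the CU further, 0 = do not split.
rng = np.random.default_rng(0)
X_train = rng.normal(size=(1000, 4))
y_train = (X_train[:, 0] + 0.5 * X_train[:, 1] > 0).astype(int)  # toy rule

clf = make_pipeline(StandardScaler(), SVC(kernel="rbf", C=1.0))
clf.fit(X_train, y_train)

def early_cu_decision(features):
    """Return True if the classifier predicts that the CU should be split,
    letting the encoder skip the exhaustive RD evaluation of the other option."""
    return bool(clf.predict(features.reshape(1, -1))[0])

print(early_cu_decision(rng.normal(size=4)))
```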
15. | Jayasingam, Adhuran; Kulupana, Gosala; Galkandage, Chathura; Fernando, Anil: Evaluation of Modifications to CPPPSNR in 360° Video Quality Assessment. Inproceedings, 2019 27th European Signal Processing Conference (EUSIPCO), pp. 1-5, 2019, ISSN: 2076-1465, DOI: 10.23919/EUSIPCO.2019.8903145.
360° videos are represented in spherical projection formats, and the video quality of such videos is assessed using spherical objective quality metrics. Furthermore, the objective video quality between two different spherical projection formats can be evaluated using cross-projection metrics. Craster parabola is a 2D cross-projection format which is used by the Craster Parabolic Peak Signal-to-Noise Ratio (CPPPSNR) metric. The existing CPPPSNR measurements do not consider the subsampling locations during the quality assessment to match the pixel density of a sphere. Nevertheless, it is vitally important to account for the oversampled projection formats and the sphere in order to be compatible with existing video encoding architectures. To this end, the proposed improvements to CPPPSNR locate the subsample points during craster parabolic projection and use nearest-neighbour interpolation to assign pixels from the craster parabolic projection. Furthermore, in order to compensate for the occurrence of oversampling, appropriate weights are applied to the corresponding pixels. The proposed method was tested with Shanghai Jiao Tong University (SJTU) Virtual Reality (VR) sequences for projection conversion. The comparison with Spherical PSNR (SPSNR) and the existing CPPPSNR validates the proposed CPPPSNR as an objective quality metric for cross projections.
14. | Erabadda, Buddhiprabha; Mallikarachchi, Thanuja; Hewage, Chaminda; Fernando, Anil: Quality of Experience (QoE)-Aware Fast Coding Unit Size Selection for HEVC Intra-prediction. Journal Article, Future Internet, 11 (8), pp. 175, MDPI AG, 2019, ISSN: 1999-5903, DOI: 10.3390/fi11080175.
The exorbitant increase in the computational complexity of modern video coding standards, such as High Efficiency Video Coding (HEVC), is a compelling challenge for resource-constrained consumer electronic devices. For instance, the brute-force evaluation of all possible combinations of available coding modes and the quadtree-based coding structure in HEVC to determine the optimum set of coding parameters for a given content demands a substantial amount of computational and energy resources. Thus, the resource requirements for real-time operation of HEVC have become a contributing factor towards the Quality of Experience (QoE) of the end users of emerging multimedia and future internet applications. In this context, this paper proposes a content-adaptive Coding Unit (CU) size selection algorithm for HEVC intra-prediction. The proposed algorithm builds content-specific weighted Support Vector Machine (SVM) models in real time during the encoding process, to provide an early estimate of the CU size for a given content, avoiding the brute-force evaluation of all possible coding mode combinations in HEVC. The experimental results demonstrate an average encoding time reduction of 52.38%, with an average Bjøntegaard Delta Bit Rate (BDBR) increase of 1.19% compared to the HM16.1 reference encoder. Furthermore, the perceptual visual quality assessments conducted through the Video Quality Metric (VQM) show minimal visual quality impact on the reconstructed videos of the proposed algorithm compared to state-of-the-art approaches.
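In contrast to a plain classifier, the "content-specific weighted SVM" idea can be sketched by fitting the model online on features gathered from fully evaluated early frames and weighting each training sample. The features, labels and RD-cost-based sample weights below are hypothetical placeholders, not the paper's actual formulation.

```python
import numpy as np
from sklearn.svm import SVC

# Online, content-specific model building: collect features/labels from the
# first few fully RD-evaluated frames, then fit a weighted SVM where each
# sample is weighted by a (hypothetical) measure of how costly a wrong
# CU-size decision would be for that block.
rng = np.random.default_rng(1)
features = rng.normal(size=(500, 5))            # per-CU features, early frames
labels = (features[:, 0] > 0).astype(int)       # 1 = split, 0 = no split (toy)
rd_cost_penalty = np.abs(features[:, 0]) + 0.1  # stand-in for RD-cost weights

model = SVC(kernel="rbf", C=1.0)
model.fit(features, labels, sample_weight=rd_cost_penalty)

# later frames: predict the CU size early instead of exhaustive evaluation
new_blocks = rng.normal(size=(10, 5))
early_decisions = model.predict(new_blocks)
```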
13. | Koller, Oscar; Camgöz, Necati Cihan; Ney, Hermann; Bowden, Richard Weakly Supervised Learning with Multi-Stream CNN-LSTM-HMMs to Discover Sequential Parallelism in Sign Language Videos Journal Article IEEE Transactions on Pattern Analysis and Machine Intelligence, pp. 1–1, 2019. @article{surrey851776, title = {Weakly Supervised Learning with Multi-Stream CNN-LSTM-HMMs to Discover Sequential Parallelism in Sign Language Videos}, author = {Oscar Koller and Necati Cihan Camgöz and Hermann Ney and Richard Bowden}, url = {http://epubs.surrey.ac.uk/851776/}, doi = {10.1109/TPAMI.2019.2911077}, year = {2019}, date = {2019-04-01}, journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence}, pages = {1--1}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, abstract = {In this work we present a new approach to the field of weakly supervised learning in the video domain. Our method is relevant to sequence learning problems which can be split up into sub-problems that occur in parallel. Here, we experiment with sign language data. The approach exploits sequence constraints within each independent stream and combines them by explicitly imposing synchronisation points to make use of parallelism that all sub-problems share. We do this with multi-stream HMMs while adding intermediate synchronisation constraints among the streams. We embed powerful CNN-LSTM models in each HMM stream following the hybrid approach. This allows the discovery of attributes which on their own lack sufficient discriminative power to be identified. We apply the approach to the domain of sign language recognition exploiting the sequential parallelism to learn sign language, mouth shape and hand shape classifiers. We evaluate the classifiers on three publicly available benchmark data sets featuring challenging real-life sign language with over 1000 classes, full sentence based lip-reading and articulated hand shape recognition on a fine-grained hand shape taxonomy featuring over 60 different hand shapes. We clearly outperform the state-of-the-art on all data sets and observe significantly faster convergence using the parallel alignment approach.}, keywords = {}, pubstate = {published}, tppubtype = {article} } In this work we present a new approach to the field of weakly supervised learning in the video domain. Our method is relevant to sequence learning problems which can be split up into sub-problems that occur in parallel. Here, we experiment with sign language data. The approach exploits sequence constraints within each independent stream and combines them by explicitly imposing synchronisation points to make use of parallelism that all sub-problems share. We do this with multi-stream HMMs while adding intermediate synchronisation constraints among the streams. We embed powerful CNN-LSTM models in each HMM stream following the hybrid approach. This allows the discovery of attributes which on their own lack sufficient discriminative power to be identified. We apply the approach to the domain of sign language recognition exploiting the sequential parallelism to learn sign language, mouth shape and hand shape classifiers. We evaluate the classifiers on three publicly available benchmark data sets featuring challenging real-life sign language with over 1000 classes, full sentence based lip-reading and articulated hand shape recognition on a fine-grained hand shape taxonomy featuring over 60 different hand shapes. 
We clearly outperform the state-of-the-art on all data sets and observe significantly faster convergence using the parallel alignment approach. |
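As a rough illustration of the per-stream CNN-LSTM idea described above (a toy sketch, not the published architecture; layer sizes, class counts and input resolution are invented), each articulator stream can be modelled by a frame-wise CNN feeding a recurrent layer that emits per-frame posteriors for its HMM stream:

import torch
import torch.nn as nn

class StreamCNNLSTM(nn.Module):
    """One stream (e.g. hand shape or mouth shape): frame-wise CNN features
    fed to an LSTM that emits per-frame class posteriors for an HMM stream."""
    def __init__(self, num_classes, feat_dim=256, hidden=512):
        super().__init__()
        self.cnn = nn.Sequential(
            nn.Conv2d(3, 32, 3, stride=2, padding=1), nn.ReLU(),
            nn.Conv2d(32, 64, 3, stride=2, padding=1), nn.ReLU(),
            nn.AdaptiveAvgPool2d(1), nn.Flatten(), nn.Linear(64, feat_dim))
        self.lstm = nn.LSTM(feat_dim, hidden, batch_first=True, bidirectional=True)
        self.head = nn.Linear(2 * hidden, num_classes)

    def forward(self, frames):            # frames: (B, T, 3, H, W)
        b, t = frames.shape[:2]
        f = self.cnn(frames.flatten(0, 1)).view(b, t, -1)
        h, _ = self.lstm(f)
        return self.head(h).log_softmax(-1)   # (B, T, num_classes)

# Three independent streams whose frame posteriors would then be aligned by
# multi-stream HMMs with shared synchronisation points (not shown here).
streams = [StreamCNNLSTM(c) for c in (1000, 40, 60)]  # sign / mouth / hand
x = torch.randn(1, 16, 3, 96, 96)
posteriors = [s(x) for s in streams]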
12. | Fechteler, Philipp; Hilsmann, Anna; Eisert, Peter Markerless Multiview Motion Capture with 3D Shape Model Adaptation Journal Article Computer Graphics Forum, 2019. @article{1, title = {Markerless Multiview Motion Capture with 3D Shape Model Adaptation}, author = {Philipp Fechteler and Anna Hilsmann and Peter Eisert}, url = {https://onlinelibrary.wiley.com/doi/epdf/10.1111/cgf.13608}, doi = {10.1111/cgf.13608}, year = {2019}, date = {2019-03-18}, journal = {Computer Graphics Forum}, abstract = {In this paper, we address simultaneous markerless motion and shape capture from 3D input meshes of partial views onto a moving subject. We exploit a computer graphics model based on kinematic skinning as template tracking model. This template model consists of vertices, joints and skinning weights learned a priori from registered full‐body scans, representing true human shape and kinematics‐based shape deformations. Two data‐driven priors are used together with a set of constraints and cues for setting up sufficient correspondences. A Gaussian mixture model‐based pose prior of successive joint configurations is learned to soft‐constrain the attainable pose space to plausible human poses. To make the shape adaptation robust to outliers and non‐visible surface regions and to guide the shape adaptation towards realistically appearing human shapes, we use a mesh‐Laplacian‐based shape prior. Both priors are learned/extracted from the training set of the template model learning phase. The output is a model adapted to the captured subject with respect to shape and kinematic skeleton as well as the animation parameters to resemble the observed movements. With example applications, we demonstrate the benefit of such footage. Experimental evaluations on publicly available datasets show the achieved natural appearance and accuracy.}, keywords = {}, pubstate = {published}, tppubtype = {article} } In this paper, we address simultaneous markerless motion and shape capture from 3D input meshes of partial views onto a moving subject. We exploit a computer graphics model based on kinematic skinning as template tracking model. This template model consists of vertices, joints and skinning weights learned a priori from registered full‐body scans, representing true human shape and kinematics‐based shape deformations. Two data‐driven priors are used together with a set of constraints and cues for setting up sufficient correspondences. A Gaussian mixture model‐based pose prior of successive joint configurations is learned to soft‐constrain the attainable pose space to plausible human poses. To make the shape adaptation robust to outliers and non‐visible surface regions and to guide the shape adaptation towards realistically appearing human shapes, we use a mesh‐Laplacian‐based shape prior. Both priors are learned/extracted from the training set of the template model learning phase. The output is a model adapted to the captured subject with respect to shape and kinematic skeleton as well as the animation parameters to resemble the observed movements. With example applications, we demonstrate the benefit of such footage. Experimental evaluations on publicly available datasets show the achieved natural appearance and accuracy. |
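The Gaussian-mixture pose prior mentioned above can be pictured with a short sketch (illustrative only; the joint parameterisation, component count and weighting are assumptions): fit a GMM to training poses and add its negative log-likelihood to the tracking energy as a soft constraint.

import numpy as np
from sklearn.mixture import GaussianMixture

# Train a pose prior on joint-parameter vectors from the template-learning
# set (random data here stands in for registered training poses).
train_poses = np.random.randn(5000, 30)          # e.g. 30 joint parameters
pose_prior = GaussianMixture(n_components=16, covariance_type='full')
pose_prior.fit(train_poses)

def pose_prior_penalty(pose, weight=0.1):
    """Soft constraint: negative log-likelihood of a candidate pose under the
    GMM, to be added to the data term of the tracking energy."""
    return -weight * pose_prior.score_samples(pose.reshape(1, -1))[0]

candidate = np.random.randn(30)
print("prior penalty:", pose_prior_penalty(candidate))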
11. | Udora, Carl C; Mir, Junaid; Galkandage, Chathura; Fernando, Anil QoE Modelling of High Dynamic Range Video Inproceedings 2019 IEEE International Conference on Consumer Electronics (ICCE), pp. 1–2, Institute of Electrical and Electronics Engineers (IEEE), 2019. @inproceedings{surrey852000, title = {QoE Modelling of High Dynamic Range Video}, author = {Carl C Udora and Junaid Mir and Chathura Galkandage and Anil Fernando}, url = {http://epubs.surrey.ac.uk/852000/}, doi = {10.1109/ICCE.2019.8662122}, year = {2019}, date = {2019-03-01}, booktitle = {2019 IEEE International Conference on Consumer Electronics (ICCE)}, journal = {Proceedings of the 2019 IEEE International Conference on Consumer Electronics (ICCE)}, pages = {1--2}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, abstract = {There is no standard way of measuring the level of user satisfaction for HDR video content, owing to the difficulty of building HDR quality assessment metrics. To overcome this limitation, Quality of Experience (QoE) modelling of HDR video has been proposed to find a robust and accurate HDR video QoE metric. The proposed model is the first attempt towards assessing and devising a non-reference quality metric for HDR video. It is based on finding the correlation between the HDR video features and the subjective test results. The proposed model achieves a significant correlation score of 0.724 with the subjective results.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } There is no standard way of measuring the level of user satisfaction for HDR video content, owing to the difficulty of building HDR quality assessment metrics. To overcome this limitation, Quality of Experience (QoE) modelling of HDR video has been proposed to find a robust and accurate HDR video QoE metric. The proposed model is the first attempt towards assessing and devising a non-reference quality metric for HDR video. It is based on finding the correlation between the HDR video features and the subjective test results. The proposed model achieves a significant correlation score of 0.724 with the subjective results. |
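To make the reported correlation concrete, the following sketch (synthetic features and scores; not the paper's model or feature set) fits a simple linear QoE predictor to HDR video features and measures the Pearson correlation of its output against subjective scores:

import numpy as np
from scipy.stats import pearsonr

# Illustrative only: per-sequence HDR feature vectors and subjective scores.
features = np.random.rand(40, 5)                      # e.g. luminance, contrast, motion
mos = 1.0 + 2.0 * features[:, 0] + 1.5 * features[:, 1] + 0.3 * np.random.rand(40)

# Fit a linear QoE model by least squares and check how well its predictions
# correlate with the subjective results, as the paper reports.
X = np.hstack([features, np.ones((len(features), 1))])
weights, *_ = np.linalg.lstsq(X, mos, rcond=None)
predicted = X @ weights
rho, _ = pearsonr(predicted, mos)
print("Pearson correlation with subjective scores: %.3f" % rho)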
10. | Erabadda, Buddhiprabha; Mallikarachchi, Thanuja; Kulupana, Gosala; Fernando, Anil Content Adaptive Fast CU Size Selection for HEVC Intra-Prediction Inproceedings 2019 IEEE International Conference on Consumer Electronics (ICCE), pp. 1–2, Institute of Electrical and Electronics Engineers (IEEE), 2019. @inproceedings{surrey851998, title = {Content Adaptive Fast CU Size Selection for HEVC Intra-Prediction}, author = {Buddhiprabha Erabadda and Thanuja Mallikarachchi and Gosala Kulupana and Anil Fernando}, url = {http://epubs.surrey.ac.uk/851998/}, doi = {10.1109/ICCE.2019.8662119}, year = {2019}, date = {2019-03-01}, booktitle = {2019 IEEE International Conference on Consumer Electronics (ICCE)}, journal = {Proceedings of the 2019 IEEE International Conference on Consumer Electronics (ICCE)}, pages = {1--2}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, abstract = {This paper proposes a content adaptive fast CU size selection algorithm for HEVC intra-prediction using weighted support vector machines. The proposed algorithm demonstrates an average encoding time reduction of 52.38% with 1.19% average BDBR increase compared to HM16.1 reference encoder.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } This paper proposes a content adaptive fast CU size selection algorithm for HEVC intra-prediction using weighted support vector machines. The proposed algorithm demonstrates an average encoding time reduction of 52.38% with 1.19% average BDBR increase compared to HM16.1 reference encoder. |
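A weighted support vector machine of the kind referred to above can be sketched as follows (toy features, labels and weights; not the paper's feature set or training scheme), where each training sample is weighted by a hypothetical rate-distortion cost of misclassifying it:

import numpy as np
from sklearn.svm import SVC

# Binary split / no-split decision per CU from simple texture features,
# trained with per-sample weights so that mistakes that would cost more in
# rate-distortion terms are penalised harder.
rng = np.random.default_rng(0)
X = rng.random((2000, 4))                          # e.g. variance, gradient stats
y = (X[:, 0] + 0.5 * X[:, 1] > 0.8).astype(int)    # 1 = split the CU
rd_cost_of_error = 1.0 + 4.0 * rng.random(2000)    # hypothetical RD penalty

clf = SVC(kernel='rbf', C=1.0)
clf.fit(X, y, sample_weight=rd_cost_of_error)

cu_features = rng.random((1, 4))
print("split CU" if clf.predict(cu_features)[0] else "keep CU size")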
9. | Rakngan, Sarat; Mallikarachchi, Thanuja; Fernando, Anil An adaptive video streaming framework for Scalable HEVC (SHVC) standard Inproceedings 2019 IEEE International Conference on Consumer Electronics (ICCE), pp. 1–2, Institute of Electrical and Electronics Engineers (IEEE), 2019. @inproceedings{surrey851996, title = {An adaptive video streaming framework for Scalable HEVC (SHVC) standard}, author = {Sarat Rakngan and Thanuja Mallikarachchi and Anil Fernando}, url = {http://epubs.surrey.ac.uk/851996/}, doi = {10.1109/ICCE.2019.8662075}, year = {2019}, date = {2019-03-01}, booktitle = {2019 IEEE International Conference on Consumer Electronics (ICCE)}, journal = {Proceedings of the 2019 IEEE International Conference on Consumer Electronics (ICCE)}, pages = {1--2}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, abstract = {This paper presents an implementation of a Media Aware Network Element (MANE) for dynamic video content adaptation in Scalable HEVC (SHVC) video streaming. The experimental results discuss the varying quality-to-playback time ratio and decoding power consumption with random access period in SHVC encoding under fluctuating and persistent network bandwidth conditions.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } This paper presents an implementation of a Media Aware Network Element (MANE) for dynamic video content adaptation in Scalable HEVC (SHVC) video streaming. The experimental results discuss the varying quality-to-playback time ratio and decoding power consumption with random access period in SHVC encoding under fluctuating and persistent network bandwidth conditions. |
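The adaptation step performed by such a Media Aware Network Element can be pictured with a small sketch (layer names and bitrates are invented; the real MANE operates on NAL units rather than a Python list): forward only the SHVC layers whose cumulative bitrate fits the measured bandwidth.

# Hypothetical sketch of a MANE adaptation decision for SHVC streaming.
SHVC_LAYERS = [              # (layer id, cumulative bitrate in kbps)
    ("base_540p", 1500),
    ("enh1_1080p", 4500),
    ("enh2_2160p", 12000),
]

def select_layers(available_kbps):
    """Return the highest set of layers that fits the measured bandwidth."""
    chosen = [SHVC_LAYERS[0]]                 # the base layer is always forwarded
    for layer in SHVC_LAYERS[1:]:
        if layer[1] <= available_kbps:
            chosen.append(layer)
    return [name for name, _ in chosen]

print(select_layers(5000))   # -> ['base_540p', 'enh1_1080p']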
8. | Rochette, Guillaume; Russell, Chris; Bowden, Richard Weakly-Supervised 3D Pose Estimation from a Single Image using Multi-View Consistency Inproceedings 30th British Machine Vision Conference (BMVC 2019), BMVC, 2019. @inproceedings{surrey852639, title = {Weakly-Supervised 3D Pose Estimation from a Single Image using Multi-View Consistency}, author = {Guillaume Rochette and Chris Russell and Richard Bowden}, url = {http://epubs.surrey.ac.uk/852639/}, year = {2019}, date = {2019-01-01}, booktitle = {30th British Machine Vision Conference (BMVC 2019)}, journal = {Proceedings of the 30th British Machine Vision Conference (BMVC 2019)}, publisher = {BMVC}, abstract = {We present a novel data-driven regularizer for weakly-supervised learning of 3D human pose estimation that eliminates the drift problem that affects existing approaches. We do this by moving the stereo reconstruction problem into the loss of the network itself. This avoids the need to reconstruct 3D data prior to training and unlike previous semi-supervised approaches, avoids the need for a warm-up period of supervised training. The conceptual and implementational simplicity of our approach is fundamental to its appeal. Not only is it straightforward to augment many weakly-supervised approaches with our additional re-projection based loss, but it is obvious how it shapes reconstructions and prevents drift. As such we believe it will be a valuable tool for any researcher working in weakly-supervised 3D reconstruction. Evaluating on Panoptic, the largest multi-camera and markerless dataset available, we obtain an accuracy that is essentially indistinguishable from a strongly-supervised approach making full use of 3D groundtruth in training.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } We present a novel data-driven regularizer for weakly-supervised learning of 3D human pose estimation that eliminates the drift problem that affects existing approaches. We do this by moving the stereo reconstruction problem into the loss of the network itself. This avoids the need to reconstruct 3D data prior to training and unlike previous semi-supervised approaches, avoids the need for a warm-up period of supervised training. The conceptual and implementational simplicity of our approach is fundamental to its appeal. Not only is it straightforward to augment many weakly-supervised approaches with our additional re-projection based loss, but it is obvious how it shapes reconstructions and prevents drift. As such we believe it will be a valuable tool for any researcher working in weakly-supervised 3D reconstruction. Evaluating on Panoptic, the largest multi-camera and markerless dataset available, we obtain an accuracy that is essentially indistinguishable from a strongly-supervised approach making full use of 3D groundtruth in training. |
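The multi-view consistency idea above, moving reprojection into the loss itself, can be illustrated with a small numerical sketch (toy cameras and joints; not the paper's network or loss weighting):

import numpy as np

def project(P, X):
    """Pinhole projection of 3D points X (N,3) with a 3x4 camera matrix P."""
    Xh = np.hstack([X, np.ones((len(X), 1))])
    x = (P @ Xh.T).T
    return x[:, :2] / x[:, 2:3]

def multiview_consistency_loss(X_pred, cameras, detections_2d):
    """Mean reprojection error of one predicted 3D pose against 2D keypoint
    detections in several calibrated views (illustrative only)."""
    err = 0.0
    for P, det in zip(cameras, detections_2d):
        err += np.mean(np.linalg.norm(project(P, X_pred) - det, axis=1))
    return err / len(cameras)

# Toy example: one camera at the origin, one translated along x.
K = np.array([[1000., 0., 500.], [0., 1000., 500.], [0., 0., 1.]])
P1 = K @ np.hstack([np.eye(3), np.zeros((3, 1))])
P2 = K @ np.hstack([np.eye(3), np.array([[-0.5], [0.], [0.]])])
X = np.random.rand(17, 3) + np.array([0., 0., 3.])   # joints in front of the cameras
print(multiview_consistency_loss(X, [P1, P2], [project(P1, X), project(P2, X)]))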
7. | Morgenstern, Wieland; Hilsmann, Anna; Eisert, Peter Progressive Non-Rigid Registration of Temporal Mesh Sequences Inproceedings European Conference on Visual Media Production, Association for Computing Machinery, London, United Kingdom, 2019, ISBN: 9781450370035, (Best Paper Award). @inproceedings{10.1145/3359998.3369411, title = {Progressive Non-Rigid Registration of Temporal Mesh Sequences}, author = {Wieland Morgenstern and Anna Hilsmann and Peter Eisert}, url = {https://doi.org/10.1145/3359998.3369411}, doi = {10.1145/3359998.3369411}, isbn = {9781450370035}, year = {2019}, date = {2019-01-01}, booktitle = {European Conference on Visual Media Production}, publisher = {Association for Computing Machinery}, address = {London, United Kingdom}, series = {CVMP ’19}, note = {Best Paper Award}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } |
2018 |
|
6. | Erabadda, Buddhiprabha; Mallikarachchi, Thanuja; Kulupana, Gosala; Fernando, Anil Machine Learning Approaches for Intra-Prediction in HEVC Inproceedings 2018 IEEE 7th Global Conference on Consumer Electronics (GCCE), pp. 206–209, Institute of Electrical and Electronics Engineers (IEEE), 2018. @inproceedings{surrey851999, title = {Machine Learning Approaches for Intra-Prediction in HEVC}, author = {Buddhiprabha Erabadda and Thanuja Mallikarachchi and Gosala Kulupana and Anil Fernando}, url = {http://epubs.surrey.ac.uk/851999/}, doi = {10.1109/GCCE.2018.8574648}, year = {2018}, date = {2018-12-01}, booktitle = {2018 IEEE 7th Global Conference on Consumer Electronics (GCCE)}, journal = {Proceedings of the 2018 IEEE 7th Global Conference on Consumer Electronics (GCCE)}, pages = {206--209}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, abstract = {The use of machine learning techniques for encoding complexity reduction in recent video coding standards such as High Efficiency Video Coding (HEVC) has received prominent attention in the recent past. Yet, the dynamically changing nature of the video contents makes it ever more challenging to use rigid traditional inference models for predicting the encoding decisions for a given content. In this context, this paper investigates the resulting implications on the coding efficiency and the encoding complexity, when using offline trained and online trained machine-learning models for coding unit size selection in HEVC intra-prediction. The experimental results demonstrate that the ground-truth encoding statistics of the content being encoded are crucial to efficient encoding decision prediction when using machine learning based prediction models.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } The use of machine learning techniques for encoding complexity reduction in recent video coding standards such as High Efficiency Video Coding (HEVC) has received prominent attention in the recent past. Yet, the dynamically changing nature of the video contents makes it ever more challenging to use rigid traditional inference models for predicting the encoding decisions for a given content. In this context, this paper investigates the resulting implications on the coding efficiency and the encoding complexity, when using offline trained and online trained machine-learning models for coding unit size selection in HEVC intra-prediction. The experimental results demonstrate that the ground-truth encoding statistics of the content being encoded are crucial to efficient encoding decision prediction when using machine learning based prediction models. |
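The offline-versus-online contrast investigated above can be mimicked with a small sketch (synthetic CU features and labels; not the paper's models), where one classifier is trained once and another keeps updating with the ground-truth statistics of the content being encoded:

import numpy as np
from sklearn.linear_model import SGDClassifier

rng = np.random.default_rng(1)

def cu_batch(shift):
    """Synthetic split / no-split decisions whose statistics drift over time."""
    X = rng.random((200, 4)) + shift
    y = (X[:, 0] > 0.5 + shift).astype(int)
    return X, y

offline = SGDClassifier(random_state=0)
X0, y0 = cu_batch(0.0)
offline.fit(X0, y0)                            # trained once, never updated

online = SGDClassifier(random_state=0)
online.partial_fit(X0, y0, classes=[0, 1])
for step in range(1, 6):                       # keep re-training as the content changes
    Xf, yf = cu_batch(0.1 * step)
    online.partial_fit(Xf, yf)

Xt, yt = cu_batch(0.5)
print("offline accuracy:", offline.score(Xt, yt))
print("online  accuracy:", online.score(Xt, yt))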
5. | Kulupana, Gosala; Talagala, Dumidu S; Arachchi, Hemantha Kodikara; Fernando, Anil End User Video Quality Prediction and Coding Parameters Selection at the Encoder for Robust HEVC Video Transmission Journal Article IEEE Transactions on Circuits and Systems for Video Technology, 2018. @article{2, title = {End User Video Quality Prediction and Coding Parameters Selection at the Encoder for Robust HEVC Video Transmission}, author = {Gosala Kulupana and Dumidu S. Talagala and Hemantha Kodikara Arachchi and Anil Fernando}, url = {http://epubs.surrey.ac.uk/850098/}, doi = {10.1109/tcsvt.2018.2879956}, year = {2018}, date = {2018-11-09}, journal = {IEEE Transactions on Circuits and Systems for Video Technology}, abstract = {Along with the rapid increase in the availability of high quality video formats such as HD (High Definition), UHD (Ultra HD) and HDR (High Dynamic Range), a huge demand for data rates during their transmission has become inevitable. Consequently, the role of video compression techniques has become crucially important in the process of mitigating the data rate requirements. Even though the latest video codec HEVC (High Efficiency Video Coding) has succeeded in significantly reducing the data rate compared to its immediate predecessor H.264/AVC (Advanced Video Coding), the HEVC coded videos in the meantime have become even more vulnerable to network impairments. Therefore, it is equally important to assess the consumers’ perceived quality degradation prior to transmitting HEVC coded videos over an error prone network, and to include error resilient features so as to minimize the adverse effects of those impairments. To this end, this paper proposes a probabilistic model which accurately predicts the overall distortion of the decoded video at the encoder, followed by an accurate QP-λ relationship which can be used in the RDO (Rate Distortion Optimization) process. During the derivation of the probabilistic model, the impacts of the motion vectors, the pixels in the reference frames and the clipping operations are accounted for, and consequently the model is capable of reducing the prediction error to as low as 3.11%, whereas the state-of-the-art methods cannot reach below 20.08% under identical conditions. Furthermore, the enhanced RDO process has resulted in a 21.41%-43.59% improvement in the BD-rate compared to the state-of-the-art error resilient algorithms.}, keywords = {}, pubstate = {published}, tppubtype = {article} } Along with the rapid increase in the availability of high quality video formats such as HD (High Definition), UHD (Ultra HD) and HDR (High Dynamic Range), a huge demand for data rates during their transmission has become inevitable. Consequently, the role of video compression techniques has become crucially important in the process of mitigating the data rate requirements. Even though the latest video codec HEVC (High Efficiency Video Coding) has succeeded in significantly reducing the data rate compared to its immediate predecessor H.264/AVC (Advanced Video Coding), the HEVC coded videos in the meantime have become even more vulnerable to network impairments. Therefore, it is equally important to assess the consumers’ perceived quality degradation prior to transmitting HEVC coded videos over an error prone network, and to include error resilient features so as to minimize the adverse effects of those impairments.
To this end, this paper proposes a probabilistic model which accurately predicts the overall distortion of the decoded video at the encoder, followed by an accurate QP-λ relationship which can be used in the RDO (Rate Distortion Optimization) process. During the derivation of the probabilistic model, the impacts of the motion vectors, the pixels in the reference frames and the clipping operations are accounted for, and consequently the model is capable of reducing the prediction error to as low as 3.11%, whereas the state-of-the-art methods cannot reach below 20.08% under identical conditions. Furthermore, the enhanced RDO process has resulted in a 21.41%-43.59% improvement in the BD-rate compared to the state-of-the-art error resilient algorithms. |
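The QP-λ relationship mentioned above refines the standard rate-distortion optimisation rule of thumb. As context, a minimal sketch of the commonly quoted baseline relation (the constant 0.85 is the usual default, not the paper's derived value):

def rdo_lambda(qp, alpha=0.85):
    """Commonly quoted baseline relation between the quantisation parameter
    and the Lagrange multiplier used in RDO: lambda = alpha * 2^((QP-12)/3)."""
    return alpha * 2.0 ** ((qp - 12) / 3.0)

def rd_cost(distortion, rate_bits, qp):
    """Mode decision then minimises J = D + lambda * R."""
    return distortion + rdo_lambda(qp) * rate_bits

print(rdo_lambda(22), rdo_lambda(32), rdo_lambda(37))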
4. | Stoll, Stephanie; Camgöz, Necati Cihan; Hadfield, Simon; Bowden, Richard Sign Language Production using Neural Machine Translation and Generative Adversarial Networks Inproceedings 29th British Machine Vision Conference (BMVC 2018), British Machine Vision Association, 2018. @inproceedings{surrey848809, title = {Sign Language Production using Neural Machine Translation and Generative Adversarial Networks}, author = {Stephanie Stoll and Necati Cihan Camgöz and Simon Hadfield and Richard Bowden}, url = {http://epubs.surrey.ac.uk/848809/}, year = {2018}, date = {2018-09-01}, booktitle = {29th British Machine Vision Conference (BMVC 2018)}, journal = {Proceedings of the 29th British Machine Vision Conference (BMVC 2018)}, publisher = {British Machine Vision Association}, abstract = {We present a novel approach to automatic Sign Language Production using state-of-the-art Neural Machine Translation (NMT) and Image Generation techniques. Our system is capable of producing sign videos from spoken language sentences. Contrary to current approaches that are dependent on heavily annotated data, our approach requires minimal gloss and skeletal level annotations for training. We achieve this by breaking down the task into dedicated sub-processes. We first translate spoken language sentences into sign gloss sequences using an encoder-decoder network. We then find a data driven mapping between glosses and skeletal sequences. We use the resulting pose information to condition a generative model that produces sign language video sequences. We evaluate our approach on the recently released PHOENIX14T Sign Language Translation dataset. We set a baseline for text-to-gloss translation, reporting a BLEU-4 score of 16.34/15.26 on dev/test sets. We further demonstrate the video generation capabilities of our approach by sharing qualitative results of generated sign sequences given their skeletal correspondence.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } We present a novel approach to automatic Sign Language Production using state-of-the-art Neural Machine Translation (NMT) and Image Generation techniques. Our system is capable of producing sign videos from spoken language sentences. Contrary to current approaches that are dependent on heavily annotated data, our approach requires minimal gloss and skeletal level annotations for training. We achieve this by breaking down the task into dedicated sub-processes. We first translate spoken language sentences into sign gloss sequences using an encoder-decoder network. We then find a data driven mapping between glosses and skeletal sequences. We use the resulting pose information to condition a generative model that produces sign language video sequences. We evaluate our approach on the recently released PHOENIX14T Sign Language Translation dataset. We set a baseline for text-to-gloss translation, reporting a BLEU-4 score of 16.34/15.26 on dev/test sets. We further demonstrate the video generation capabilities of our approach by sharing qualitative results of generated sign sequences given their skeletal correspondence. |
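The BLEU-4 baseline reported above can be reproduced in spirit with a few lines (the gloss sequences below are invented; this is not the paper's data or tokenisation), using corpus-level BLEU with uniform 1- to 4-gram weights:

from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

# Each hypothesis gloss sequence is scored against its reference sequence(s).
references = [
    [["MORGEN", "SUED", "REGION", "SONNE", "SCHEINEN"]],
    [["NORD", "WIND", "STARK", "WEHEN"]],
]
hypotheses = [
    ["MORGEN", "SUED", "SONNE", "SCHEINEN"],
    ["NORD", "WIND", "WEHEN"],
]

smooth = SmoothingFunction().method1
bleu4 = corpus_bleu(references, hypotheses,
                    weights=(0.25, 0.25, 0.25, 0.25),
                    smoothing_function=smooth)
print("BLEU-4: %.2f" % (100 * bleu4))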
3. | Kulupana, Gosala; Talagala, Dumidu S; Fernando, Anil; Arachchi, Hemantha Kodikara Bit allocation and encoding parameter selection for rate-controlled error resilient HEVC video encoding Inproceedings 2018 IEEE International Conference on Consumer Electronics (ICCE), pp. 1–4, Institute of Electrical and Electronics Engineers (IEEE), 2018. @inproceedings{surrey851997, title = {Bit allocation and encoding parameter selection for rate-controlled error resilient HEVC video encoding}, author = {Gosala Kulupana and Dumidu S. Talagala and Anil Fernando and Hemantha Kodikara Arachchi}, url = {http://epubs.surrey.ac.uk/851997/}, doi = {10.1109/ICCE.2018.8326287}, year = {2018}, date = {2018-03-01}, booktitle = {2018 IEEE International Conference on Consumer Electronics (ICCE)}, journal = {Proceedings of the 2018 IEEE International Conference on Consumer Electronics (ICCE)}, pages = {1--4}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, abstract = {Even though the latest video compression techniques such as High Efficiency Video coding (HEVC) have succeeded in significantly alleviating the bandwidth consumption during high resolution video transmission, they have become severely susceptible to transmission errors. Overcoming the resulting temporal impact of the transmission errors on the decoded video requires efficient error resilient schemes that can introduce robustness features to the coded video in order to mitigate the negative impact on the viewer. To this end, this paper proposes a rate-controlled error resilient bit allocation scheme, together with an encoding parameter selection process, to adaptively determine the most robust video coding parameters and the decoder error concealment operations during the encoding itself. Consequently, the proposed method has demonstrated 0.48dB-0.62dB PSNR gain over the state-of-the art methods at the same bit rate.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } Even though the latest video compression techniques such as High Efficiency Video coding (HEVC) have succeeded in significantly alleviating the bandwidth consumption during high resolution video transmission, they have become severely susceptible to transmission errors. Overcoming the resulting temporal impact of the transmission errors on the decoded video requires efficient error resilient schemes that can introduce robustness features to the coded video in order to mitigate the negative impact on the viewer. To this end, this paper proposes a rate-controlled error resilient bit allocation scheme, together with an encoding parameter selection process, to adaptively determine the most robust video coding parameters and the decoder error concealment operations during the encoding itself. Consequently, the proposed method has demonstrated 0.48dB-0.62dB PSNR gain over the state-of-the art methods at the same bit rate. |
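The PSNR gains quoted above are measured between reconstructed and reference frames; for completeness, a minimal PSNR helper (the frame data below is synthetic, purely for illustration):

import numpy as np

def psnr(reference, reconstructed, peak=255.0):
    """Peak signal-to-noise ratio in dB between two 8-bit frames."""
    mse = np.mean((reference.astype(np.float64) - reconstructed.astype(np.float64)) ** 2)
    return float('inf') if mse == 0 else 10.0 * np.log10(peak * peak / mse)

ref = np.random.randint(0, 256, (1080, 1920), dtype=np.uint8)
rec = np.clip(ref.astype(int) + np.random.randint(-2, 3, ref.shape), 0, 255)
print("PSNR: %.2f dB" % psnr(ref, rec))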
2. | Mallikarachchi, Thanuja; Talagala, Dumidu S; Arachchi, Hemantha Kodikara; Fernando, Anil Decoding-Complexity-Aware HEVC Encoding Using a Complexity-Rate-Distortion Model Journal Article IEEE Transactions on Consumer Electronics, 64 (1), pp. 35–43, 2018. @article{surrey846056, title = {Decoding-Complexity-Aware HEVC Encoding Using a Complexity-Rate-Distortion Model}, author = {Thanuja Mallikarachchi and Dumidu S. Talagala and Hemantha Kodikara Arachchi and Anil Fernando}, url = {http://epubs.surrey.ac.uk/846056/}, doi = {10.1109/TCE.2018.2810479}, year = {2018}, date = {2018-03-01}, journal = {IEEE Transactions on Consumer Electronics}, volume = {64}, number = {1}, pages = {35--43}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, abstract = {The energy consumption of Consumer Electronic (CE) devices during media playback is inexorably linked to the computational complexity of decoding compressed video. Reducing a CE device's energy consumption is therefore becoming ever more challenging with the increasing video resolutions and the complexity of the video coding algorithms. To this end, this paper proposes a framework that alters the video bit stream to reduce the decoding complexity and simultaneously limits the impact on the coding efficiency. In this context, this paper (i) first performs an analysis to determine the trade-off between the decoding complexity, video quality and bit rate with respect to a reference decoder implementation on a General Purpose Processor (GPP) architecture. Thereafter, (ii) a novel generic decoding complexity-aware video coding algorithm is proposed to generate decoding complexity-rate-distortion optimized High Efficiency Video Coding (HEVC) bit streams. The experimental results reveal that the bit streams generated by the proposed algorithm achieve 29.43% and 13.22% decoding complexity reductions for a similar video quality with minimal coding efficiency impact compared to the state-of-the-art approaches when applied to the HM16.0 and openHEVC decoder implementations, respectively. In addition, analysis of the energy consumption behavior for the same scenarios reveals up to 20% energy consumption reductions while achieving a similar video quality to that of HM16.0-encoded HEVC bit streams.}, keywords = {}, pubstate = {published}, tppubtype = {article} } The energy consumption of Consumer Electronic (CE) devices during media playback is inexorably linked to the computational complexity of decoding compressed video. Reducing a CE device's energy consumption is therefore becoming ever more challenging with the increasing video resolutions and the complexity of the video coding algorithms. To this end, this paper proposes a framework that alters the video bit stream to reduce the decoding complexity and simultaneously limits the impact on the coding efficiency. In this context, this paper (i) first performs an analysis to determine the trade-off between the decoding complexity, video quality and bit rate with respect to a reference decoder implementation on a General Purpose Processor (GPP) architecture. Thereafter, (ii) a novel generic decoding complexity-aware video coding algorithm is proposed to generate decoding complexity-rate-distortion optimized High Efficiency Video Coding (HEVC) bit streams.
The experimental results reveal that the bit streams generated by the proposed algorithm achieve 29.43% and 13.22% decoding complexity reductions for a similar video quality with minimal coding efficiency impact compared to the state-of-the-art approaches when applied to the HM16.0 and openHEVC decoder implementations, respectively. In addition, analysis of the energy consumption behavior for the same scenarios reveals up to 20% energy consumption reductions while achieving a similar video quality to that of HM16.0-encoded HEVC bit streams. |
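The complexity-rate-distortion trade-off described above can be caricatured by extending the usual Lagrangian mode-decision cost with a decoding-complexity term (a hypothetical sketch; the weights and per-mode statistics are invented and are not the paper's model):

def crd_cost(distortion, rate_bits, decode_cycles, lam, gamma):
    """Hypothetical complexity-rate-distortion cost: the usual Lagrangian
    J = D + lambda*R, extended with a term that charges each coding mode for
    its estimated decoding complexity."""
    return distortion + lam * rate_bits + gamma * decode_cycles

# Choosing between two candidate modes for one coding unit (made-up numbers):
modes = {
    "intra_4x4":   {"D": 120.0, "R": 96, "C": 5200},
    "intra_16x16": {"D": 150.0, "R": 40, "C": 1800},
}
best = min(modes, key=lambda m: crd_cost(modes[m]["D"], modes[m]["R"],
                                         modes[m]["C"], lam=4.0, gamma=0.01))
print("selected mode:", best)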
1. | Camgöz, Necati Cihan; Hadfield, Simon; Koller, Oscar; Ney, Hermann; Bowden, Richard Neural Sign Language Translation Inproceedings IEEE Conference on Computer Vision and Pattern Recognition (CVPR), 2018, pp. 7784–7793, IEEE, 2018. @inproceedings{surrey846335, title = {Neural Sign Language Translation}, author = {Necati Cihan Camgöz and Simon Hadfield and Oscar Koller and Hermann Ney and Richard Bowden}, url = {http://epubs.surrey.ac.uk/846335/}, doi = {10.1109/CVPR.2018.00812}, year = {2018}, date = {2018-01-01}, booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR), 2018}, journal = {Proceedings CVPR 2018}, pages = {7784--7793}, publisher = {IEEE}, abstract = {Sign Language Recognition (SLR) has been an active research field for the last two decades. However, most research to date has considered SLR as a naive gesture recognition problem. SLR seeks to recognize a sequence of continuous signs but neglects the underlying rich grammatical and linguistic structures of sign language that differ from spoken language. In contrast, we introduce the Sign Language Translation (SLT) problem. Here, the objective is to generate spoken language translations from sign language videos, taking into account the different word orders and grammar. We formalize SLT in the framework of Neural Machine Translation (NMT) for both end-to-end and pretrained settings (using expert knowledge). This allows us to jointly learn the spatial representations, the underlying language model, and the mapping between sign and spoken language. To evaluate the performance of Neural SLT, we collected the first publicly available Continuous SLT dataset, RWTH-PHOENIX-Weather 2014T. It provides spoken language translations and gloss level annotations for German Sign Language videos of weather broadcasts. Our dataset contains over .95M frames with >67K signs from a sign vocabulary of >1K and >99K words from a German vocabulary of >2.8K. We report quantitative and qualitative results for various SLT setups to underpin future research in this newly established field. The upper bound for translation performance is calculated at 19.26 BLEU-4, while our end-to-end frame-level and gloss-level tokenization networks were able to achieve 9.58 and 18.13 respectively.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } Sign Language Recognition (SLR) has been an active research field for the last two decades. However, most research to date has considered SLR as a naive gesture recognition problem. SLR seeks to recognize a sequence of continuous signs but neglects the underlying rich grammatical and linguistic structures of sign language that differ from spoken language. In contrast, we introduce the Sign Language Translation (SLT) problem. Here, the objective is to generate spoken language translations from sign language videos, taking into account the different word orders and grammar. We formalize SLT in the framework of Neural Machine Translation (NMT) for both end-to-end and pretrained settings (using expert knowledge). This allows us to jointly learn the spatial representations, the underlying language model, and the mapping between sign and spoken language. To evaluate the performance of Neural SLT, we collected the first publicly available Continuous SLT dataset, RWTH-PHOENIX-Weather 2014T. It provides spoken language translations and gloss level annotations for German Sign Language videos of weather broadcasts.
Our dataset contains over .95M frames with >67K signs from a sign vocabulary of >1K and >99K words from a German vocabulary of >2.8K. We report quantitative and qualitative results for various SLT setups to underpin future research in this newly established field. The upper bound for translation performance is calculated at 19.26 BLEU-4, while our end-to-end frame-level and gloss-level tokenization networks were able to achieve 9.58 and 18.13 respectively. |
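The NMT formulation above maps sign video representations to spoken language tokens with an encoder-decoder. A heavily simplified PyTorch sketch of that shape of model (dimensions, vocabulary size and the teacher-forcing setup are assumptions, not the published network):

import torch
import torch.nn as nn

class TinySeq2Seq(nn.Module):
    """Minimal encoder-decoder in the spirit of NMT-style SLT: a sequence of
    frame (or gloss) embeddings is encoded by a GRU and decoded into spoken
    language tokens. Purely illustrative."""
    def __init__(self, src_dim, vocab_size, hidden=256):
        super().__init__()
        self.encoder = nn.GRU(src_dim, hidden, batch_first=True)
        self.embed = nn.Embedding(vocab_size, hidden)
        self.decoder = nn.GRU(hidden, hidden, batch_first=True)
        self.out = nn.Linear(hidden, vocab_size)

    def forward(self, src_feats, tgt_tokens):
        _, h = self.encoder(src_feats)            # summarise the sign video
        dec_out, _ = self.decoder(self.embed(tgt_tokens), h)   # teacher forcing
        return self.out(dec_out)                  # (B, T_tgt, vocab)

model = TinySeq2Seq(src_dim=1024, vocab_size=3000)
tgt = torch.randint(0, 3000, (2, 12))
logits = model(torch.randn(2, 120, 1024), tgt)
# Shifting the target by one position is omitted here for brevity.
loss = nn.CrossEntropyLoss()(logits.reshape(-1, 3000), tgt.reshape(-1))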