2022
Chaudhuri, Debanjan
Enriching Text-Based Human-Machine Interactions with Additional World Knowledge PhD Thesis
Rheinische Friedrich-Wilhelms-Universität Bonn, 2022.
@phdthesis{handle:20.500.11811/10278,
title = {Enriching Text-Based Human-Machine Interactions with Additional World Knowledge},
author = {Debanjan Chaudhuri},
url = {https://hdl.handle.net/20.500.11811/10278},
year = {2022},
date = {2022-09-01},
urldate = {2022-09-01},
school = {Rheinische Friedrich-Wilhelms-Universität Bonn},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
Bader, Sebastian Richard
Semantic Digital Twins in the Industrial Internet of Things PhD Thesis
Rheinische Friedrich-Wilhelms-Universität Bonn, 2022.
@phdthesis{handle:20.500.11811/9884,
title = {Semantic Digital Twins in the Industrial Internet of Things},
author = {Sebastian Richard Bader},
url = {https://hdl.handle.net/20.500.11811/9884},
year = {2022},
date = {2022-06-01},
urldate = {2022-06-01},
school = {Rheinische Friedrich-Wilhelms-Universität Bonn},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
Lukovnikov, Denis
Deep Learning Methods for Semantic Parsing and Question Answering over Knowledge Graphs PhD Thesis
University of Bonn, 2022.
@phdthesis{DBLP:phd/dnb/Lukovnikov22,
title = {Deep Learning Methods for Semantic Parsing and Question Answering
over Knowledge Graphs},
author = {Denis Lukovnikov},
url = {https://hdl.handle.net/20.500.11811/9810},
year = {2022},
date = {2022-01-01},
school = {University of Bonn},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
2021
Mousavinezhad, Najmehsadat
Knowledge Extraction Methods for the Analysis of Contractual Agreements PhD Thesis
Rheinische Friedrich-Wilhelms-Universität Bonn, 2021.
@phdthesis{handle:20.500.11811/9414,
title = {Knowledge Extraction Methods for the Analysis of Contractual Agreements},
author = {Najmehsadat Mousavinezhad},
url = {https://hdl.handle.net/20.500.11811/9414},
year = {2021},
date = {2021-11-01},
urldate = {2021-11-01},
school = {Rheinische Friedrich-Wilhelms-Universität Bonn},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
Musyaffa, Fathoni Arief
Comparative Analysis of Open Linked Fiscal Data PhD Thesis
Rheinische Friedrich-Wilhelms-Universität Bonn, 2021.
@phdthesis{Musyaffa2021,
title = {Comparative Analysis of Open Linked Fiscal Data},
author = {Fathoni Arief Musyaffa},
url = {https://bonndoc.ulb.uni-bonn.de/xmlui/handle/20.500.11811/9114},
year = {2021},
date = {2021-06-01},
school = {Rheinische Friedrich-Wilhelms-Universität Bonn},
abstract = {The open data movement within public administrations has provided data regarding governance publicly. As public administrators and governments produce data and release the data as open data, the volume of the data is highly increasing. One of these datasets is budget and spending data, which has been gaining interest to the extent that several working groups and CSO/NGOs started working on this particular open data domain. The majority of these datasets are part of the open budget and spending datasets, which laid out data regarding how public administrations plan, revise, allocate, and expense their governance funding. The disclosure of public administration budget and spending data is expected to improve governance transparency, accountability, law enforcement, and political participation.
Unfortunately, the analysis of budget and spending datasets is not a trivial task, for several reasons. First, the quality of open fiscal data varies. Standards and recommendations for publishing open data are available; however, these standards are often not met, and no framework specifically addresses fiscal data quality measurement. Second, the datasets are heterogeneous, since they are produced by different public administrations with different business processes, accounting practices, requirements, and languages. This leads to a challenging data integration task across public budget and spending data. The structural and linguistic heterogeneity of open budget and spending data makes comparative analysis across datasets difficult to perform. Third, datasets within the budget and spending domain are complicated: to comprehend such data, expertise is needed from both the public accounting/budgeting domain and the technical domain to digest the datasets properly. Fourth, a platform to transform, store, analyze, and visualize datasets is necessary, especially one that makes semantic analysis possible. Fifth, there is no conceptual association between datasets that can be used as a comparison point to analyze fiscal records across public administrations. Lastly, there is a lack of methodology to consume and compare linked open fiscal data records across different public administrations.
Our focus in this thesis is hence to perform research to help the community gain a better understanding of open fiscal data, provide analysis of their quality, suggest a way to publish open fiscal data in an improved manner, analyze the open fiscal data heterogeneity while also laying out lessons learned regarding their current state and supporting data formats that are capable for open fiscal data integration. Consequently, a platform to digest, analyze and visualize these datasets is devised, continued with performing experiments on multilingual fiscal data concept mapping and wrapped up with a proof-of-concept description of comparative analysis over linked open fiscal data.},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
Fathalla, Said
Towards Facilitating Scholarly Communication using Semantic Technologies PhD Thesis
Rheinische Friedrich-Wilhelms-Universität Bonn, 2021.
@phdthesis{said_thesis,
title = {Towards Facilitating Scholarly Communication using Semantic Technologies},
author = {Said Fathalla},
url = {https://hdl.handle.net/20.500.11811/9089},
year = {2021},
date = {2021-05-20},
school = {Rheinische Friedrich-Wilhelms-Universität Bonn},
abstract = {Web technologies have substantially stimulated the submission of manuscripts, publishing scientific articles, as well as the organization of scholarly events, especially virtual events, when a global crisis occurs, which consequently restricts travels across the globe. Publication in scholarly events, such as conferences, workshops, and symposiums, is essential and pervasive in computer science, engineering, and natural sciences. The past years have witnessed significant growth in scholarly data published on the Web, mostly in unstructured formats, which immolate the embedded semantics and relationships between various entities. These formats restrict the reusability of the data, i.e., data analysis, retrieval, and mining. Therefore, managing, retrieving, and analyzing such data have become quite challenging. Consequently, there is a pressing need to represent this data in a semantic format, i.e., Linked Data, which significantly improves scholarly communication by supporting researchers concerning analyzing, retrieving, and exploring scholarly data. Notwithstanding the considerable advances in technology, publishing and exchanging scholarly data have not substantially changed (i.e., still follows the document-based scheme), thus restricting both developments of research applications in various industries as well as data preservation and exploration. This thesis tackles the problem of facilitating scholarly communication using semantic technologies. The ultimate aim is improving scholarly communication by facilitating the transformation from a document-based to knowledge-based scholarly communication, which helps researchers to examine science itself with a new perspective. Key steps towards the goal have been taken by proposing methodologies as well as a metrics suite for publishing and assessing the quality of scholarly events concerning several criteria, in particular, Computer Science as well as Physics, Mathematics, and Engineering. Within the framework of these criteria, steps towards assessing the quality of scholarly events and recommendations to various stakeholders have been taken. Furthermore, we engineered the Scientific Events Ontology in order to enable the enriched semantic representation of scholarly event metadata. Currently, this ontology is in use on thousands of OpenResearch.org events wiki pages. These steps will have far-reaching implications for the various stakeholders involved in the scholarly communication domain, including authors, sponsors, reviewers, publishers, and libraries. Most of the scholarly data publishers, such as Springer Nature, have taken serious steps towards publishing research data in a semantic form by publishing collated information from across the research landscape, such as research articles, scholarly events, persons, and grants, as knowledge graphs. Interlinking this data will significantly enable the provision of better and more intelligent services for the discovery of scientific work, which opens new opportunities for both scholarly data exploration and analysis. In the direction to this goal, we proposed the Science Knowledge Graph Ontologies suite, which comprises four OWL ontologies for representing the scientific knowledge in various fields of science, including Computer Science, Physics, and Pharmaceutical science. Besides, we developed an upper ontology on top of them for modeling modern science branches and related concepts, such as scientific discovery, instruments, and phenomena.},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
Dubey, Mohnish
Towards Complex Question Answering over Knowledge Graphs PhD Thesis
University of Bonn, Germany, 2021.
@phdthesis{DBLP:phd/dnb/Dubey21,
title = {Towards Complex Question Answering over Knowledge Graphs},
author = {Mohnish Dubey},
url = {https://hdl.handle.net/20.500.11811/9122},
year = {2021},
date = {2021-01-19},
school = {University of Bonn, Germany},
abstract = {Over the past decade, Knowledge Graphs (KG) have emerged as a prominent repository for storing facts about the world in a linked data architecture. Providing machines with the capability of exploring such Knowledge Graphs and answering natural language questions over them, has been an active area of research. The purpose of this work, is to delve further into the research of retrieving information stored in KGs, based on the natural language questions posed by the user. Knowledge Graph Question Answering (KGQA) aims to produce a concise answer to a user question, such that the user is exempt from using KG vocabulary and overheads of learning a formal query language. Existing KGQA systems have achieved excellent results over Simple Questions, where the information required is limited to a single triple and a single formal query pattern. Our motivation is to improve the performance of KGQA over Complex Questions, where formal query patterns significantly vary, and a single triple is not confining for all the required information. Complex KGQA provides several challenges such as understanding semantics and syntactic structure of questions, Entity Linking, Relation Linking and Answer Representation. Lack of suitable datasets for complex question answering further adds to research gaps. Hence, in this thesis, we focus the research objective of laying the foundations for the advancement of the state-of-the-art for Complex Question Answering over Knowledge Graphs, by providing techniques to solve various challenges and provide resources to fill the research gaps.
First, we propose Normalized Query Structure (NQS), which is a linguistic analyzer module that helps the QA system to detect inputs and intents and the relation between them in the users’ question. NQS acts like an intermediate language between natural language questions and formal expressions to ease the process of query formulation for complex questions. We then developed a framework named LC-QuAD to generate large scale question answering dataset by reversing the process of question answering, thereby translating natural language questions from the formal query using intermediate templates. Our goal is to use this framework for high variations in the query patterns and create a large size dataset with minimum human effort. The first version of the dataset consists of 5,000 complex questions. By extending the LC-QuAD framework to support Reified KGs and crowd-sourcing, we published the second version of the dataset as LC-QuAD 2.0, consisting of 30,000 questions with their paraphrases and has higher complexity and new variations in the questions. To overcome the problem of Entity Linking and Relation Linking in KGQA, we develop EARL, a module performing these two tasks as a single joint task for complex question answering. We develop approaches for this module, first by formalizing the task as an instance of the Generalized Traveling Salesman Problem (GTSP) and the second approach uses machine learning to exploit the connection density between nodes in the Knowledge Graph. Lastly, we create another large scale dataset to answer verbalization and provide results for multiple baseline systems on it. The Verbalization dataset is introduced to make the system’s response more human-like. The NQS based KGQA system was next to the best system in terms of accuracy on the QALD-5 dataset. We empirically prove that NQS is robust to tackle paraphrases of the questions. EARL achieves the state of the art results in Entity Linking and Relation Linking for question answering on several KGQA datasets. The dataset curated in this thesis has helped the research community to move forward in the direction of improving the accuracy of complex question answering as a task as other researchers too developed several KGQA systems and modules around these published datasets. With the large-scale datasets, we have encouraged the use of large scale machine learning, deep learning and emergence of new techniques to advance the state-of-the-art in complex question answering over knowledge graphs. We further developed core components for the KGQA pipeline to overcome the challenges of Question Understanding, Entity-Relation Linking and Answer Verbalization and thus achieve our research objective. All the proposed approaches mentioned in this thesis and the published resources are available at https://github.com/AskNowQA and are released under the umbrella project AskNow.},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
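To make the notion of a "formal query" concrete: the thesis maps natural-language questions to SPARQL over a knowledge graph such as DBpedia. The sketch below is not taken from the thesis or the LC-QuAD datasets; it simply runs one hand-written multi-triple (i.e., "complex") query against the public DBpedia endpoint using the SPARQLWrapper library, assuming that endpoint is reachable.

```python
# Illustrative only: a hand-written SPARQL query of the kind a KGQA system
# would generate from a question such as
# "Which scientists were born in Berlin and died in London?".
# Requires: pip install SPARQLWrapper
from SPARQLWrapper import SPARQLWrapper, JSON

endpoint = SPARQLWrapper("https://dbpedia.org/sparql")
endpoint.setQuery("""
    PREFIX dbo: <http://dbpedia.org/ontology/>
    PREFIX dbr: <http://dbpedia.org/resource/>
    SELECT DISTINCT ?person WHERE {
        ?person a dbo:Scientist ;
                dbo:birthPlace dbr:Berlin ;
                dbo:deathPlace dbr:London .
    } LIMIT 10
""")
endpoint.setReturnFormat(JSON)

results = endpoint.query().convert()
for binding in results["results"]["bindings"]:
    print(binding["person"]["value"])
```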
Mami, Mohamed Nadjib
Strategies for a Semantified Uniform Access to Large and Heterogeneous Data Sources PhD Thesis
University of Bonn, Germany, 2021.
@phdthesis{DBLP:phd/dnb/Mami21,
title = {Strategies for a Semantified Uniform Access to Large and Heterogeneous
Data Sources},
author = {Mohamed Nadjib Mami},
url = {https://hdl.handle.net/20.500.11811/8925},
year = {2021},
date = {2021-01-01},
school = {University of Bonn, Germany},
abstract = {The remarkable advances achieved in both research and development of Data Management as well as the prevalence of high-speed Internet and technology in the last few decades have caused unprecedented data avalanche. Large volumes of data manifested in a multitude of types and formats are being generated and becoming the new norm. In this context, it is crucial to both leverage existing approaches and propose novel ones to overcome this data size and complexity, and thus facilitate data exploitation. In this thesis, we investigate two major approaches to addressing this challenge: Physical Data Integration and Logical Data Integration. The specific problem tackled is to enable querying large and heterogeneous data sources in an ad hoc manner.
In the Physical Data Integration, data is physically and wholly transformed into a canonical unique format, which can then be directly and uniformly queried. In the Logical Data Integration, data remains in its original format and form and a middleware is posed above the data allowing to map various schemata elements to a high-level unifying formal model. The latter enables the querying of the underlying original data in an ad hoc and uniform way, a framework which we call Semantic Data Lake, SDL. Both approaches have their advantages and disadvantages. For example, in the former, a significant effort and cost are devoted to pre-processing and transforming the data to the unified canonical format. In the latter, the cost is shifted to the query processing phases, e.g., query analysis, relevant source detection and results reconciliation.
In this thesis we investigate both directions and study their strengths and weaknesses. For each direction, we propose a set of approaches and demonstrate their feasibility via a proposed implementation. In both directions, we appeal to Semantic Web technologies, which provide a set of time-proven techniques and standards that are dedicated to Data Integration. In the Physical Integration, we suggest an end-to-end blueprint for the semantification of large and heterogeneous data sources, i.e., physically transforming the data to the Semantic Web data standard RDF (Resource Description Framework). A unified data representation, storage and query interface over the data are suggested. In the Logical Integration, we provide a description of the SDL architecture, which allows querying data sources right on their original form and format without requiring a prior transformation and centralization. For a number of reasons that we detail, we put more emphasis on the virtual approach. We present the effort behind an extensible implementation of the SDL, called Squerall, which leverages state-of-the-art Semantic and Big Data technologies, e.g., RML (RDF Mapping Language) mappings, FnO (Function Ontology) ontology, and Apache Spark. A series of evaluation is conducted to evaluate the implementation along with various metrics and input data scales. In particular, we describe an industrial real-world use case using our SDL implementation. In a preparation phase, we conduct a survey for the Query Translation methods in order to back some of our design choices.},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
Mulang', Isaiah Onando
Knowledge Context for Entity and Relation Linking PhD Thesis
University of Bonn, Germany, 2021.
@phdthesis{DBLP:phd/dnb/Mulang21,
title = {Knowledge Context for Entity and Relation Linking},
author = {Isaiah Onando Mulang'},
url = {https://hdl.handle.net/20.500.11811/9384},
year = {2021},
date = {2021-01-01},
school = {University of Bonn, Germany},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
Thakker, Harsh Vrajeshkumar
On Supporting Interoperability between RDF and Property Graph Databases PhD Thesis
University of Bonn, Germany, 2021.
@phdthesis{DBLP:phd/dnb/Thakker21,
title = {On Supporting Interoperability between RDF and Property Graph Databases},
author = {Harsh Vrajeshkumar Thakker},
url = {https://hdl.handle.net/20.500.11811/9083},
year = {2021},
date = {2021-01-01},
school = {University of Bonn, Germany},
abstract = {Over the last few years, the amount and availability of machine-readable Open, Linked, and Big data on the web has increased. Simultaneously, several data management systems have emerged to deal with the increased amounts of this structured data. RDF and Graph databases are two popular approaches for data management based on modeling, storing, and querying graph-like data. RDF database systems are based on the W3C standard RDF data model and use the W3C standard SPARQL as their defacto query language. Most graph database systems are based on the Property Graph (PG) data model and use the Gremlin language as their query language due to its popularity amongst vendors. Given that both of these approaches have distinct and complementary characteristics – RDF is suited for distributed data integration with built-in world-wide unique identifiers and vocabularies; PGs, on the other hand, support horizontally scalable storage and querying, and are widely used for modern data analytics applications, – it becomes necessary to support interoperability amongst them. The main objective of this dissertation is to study and address this interoperability issue. We identified three research challenges that are concerned with the data interoperability, query interoperability, and benchmarking of these databases. First, we tackle the data interoperability problem. We propose three direct mappings (schema-dependent and schema-independent) for transforming an RDF database into a property graph database. We show that the proposed mappings satisfy the desired properties of semantics preservation and information preservation. Based on our analysis (both formal and empirical), we argue that any RDF database can be transformed into a PG database using our approach. Second, we propose a novel approach for querying PG databases using SPARQL using Gremlin traversals – GREMLINATOR to tackle the query interoperability problem. In doing so, we first formalize the declarative constructs of Gremlin language using a consolidated graph relational algebra and define mappings to translate SPARQL queries into Gremlin traversals. GREMLINATOR has been officially integrated as a plugin for the Apache TinkerPop graph computing framework (as sparql-gremlin), which enables users to execute SPARQL queries over a wide variety of OLTP graph databases and OLAP graph processing frameworks. Finally, we tackle the third, benchmarking (performance evaluation), problem. We propose a novel framework – LITMUS Benchmark Suite that allows a choke-point driven performance comparison and analysis of various databases (PG and RDF-based) using various third-party real and synthetic datasets and queries. We also studied a variety of intrinsic and extrinsic factors – data and system-specific metrics and Key Performance Indicators (KPIs) that influence a given system’s performance. LITMUS incorporates various memory, processor, data quality, indexing, query typology, and data-based metrics for providing a fine-grained evaluation of the benchmark. In conclusion, by filling the research gaps, addressed by this dissertation, we have laid a solid formal and practical foundation for supporting interoperability between the RDF and Property graph database technology stacks. The artifacts produced during the term of this dissertation have been integrated into various academic and industrial projects.},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
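As a rough illustration of what a "direct mapping" between the two data models involves (the concrete mappings and their formal properties are defined in the thesis itself, not here), the following sketch turns a tiny RDF graph into a property-graph-like structure: triples with IRI objects become edges, triples with literal objects become vertex properties.

```python
# A simplified RDF-to-property-graph mapping, for illustration only.
# Requires: pip install rdflib
from rdflib import Graph, Literal

turtle = """
@prefix ex: <http://example.org/> .
ex:alice ex:knows ex:bob ;
         ex:name  "Alice" .
"""

g = Graph()
g.parse(data=turtle, format="turtle")

vertices = {}   # IRI -> {property name: literal value}
edges = []      # (source IRI, edge label, target IRI)

for s, p, o in g:
    vertices.setdefault(str(s), {})
    if isinstance(o, Literal):
        vertices[str(s)][str(p)] = str(o)        # literal object -> vertex property
    else:
        vertices.setdefault(str(o), {})
        edges.append((str(s), str(p), str(o)))   # IRI object -> edge

print(vertices)
print(edges)
```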
2020
Höffner, Konrad
Question Answering on RDF Data Cubes PhD Thesis
Leipzig University, Germany, 2020.
@phdthesis{DBLP:phd/dnb/Hoffner21,
title = {Question Answering on RDF Data Cubes},
author = {Konrad Höffner},
url = {https://nbn-resolving.org/urn:nbn:de:bsz:15-qucosa2-742429},
year = {2020},
date = {2020-01-01},
school = {Leipzig University, Germany},
abstract = {The Semantic Web, a Web of Data, is an extension of the World Wide Web (WWW), a Web of Documents. A large amount of such data is freely available as Linked Open Data (LOD) for many areas of knowledge, forming the LOD Cloud. While this data conforms to the Resource Description Framework (RDF) and can thus be processed by machines, users need to master a formal query language and learn a specific vocabulary. Semantic Question Answering (SQA) systems remove those access barriers by letting the user ask natural language questions that the systems translate into formal queries. Thus, the research area of SQA plays an important role for the acceptance and benefit of the Semantic Web. The original contributions of this thesis to SQA are: First, we survey the current state of the art of SQA. We complement existing surveys by systematically identifying SQA publications in the chosen timeframe. 72 publications describing 62 different systems are systematically and manually selected using predefined inclusion and exclusion criteria out of 1960 candidates from the end of 2010 to July 2015. The survey identifies common challenges, structured solutions, and recommendations on research opportunities for future systems. From that point on, we focus on multidimensional numerical data, which is immensely valuable as it influences decisions in health care, policy and finance, among others. With the growth of the open data movement, more and more of it is becoming freely available. A large amount of such data is included in the LOD cloud using the RDF Data Cube (RDC) vocabulary. However, consuming multidimensional numerical data requires experts and specialized tools. Traditional SQA systems cannot process RDCs because their meta-structure is opaque to applications that expect facts to be encoded in single triples, This motivates our second contribution, the design and implementation of the first SQA algorithm on RDF Data Cubes. We kick-start this new research subfield by creating a user question corpus and a benchmark over multiple data sets. The evaluation of our system on the benchmark, which is included in the public Question Answering over Linked Data (QALD) challenge of 2016, shows the feasibility of the approach, but also highlights challenges, which we discuss in detail as a starting point for future work in the field. The benchmark is based on our final contribution, the addition of 955 financial government spending data sets to the LOD cloud by transforming data sets of the OpenSpending project to RDF Data Cubes. Open spending data has the power to reduce corruption by increasing accountability and strengthens democracy because voters can make better informed decisions. An informed and trusting public also strengthens the government itself because it is more likely to commit to large projects. OpenSpending.org is an open platform that provides public finance data from governments around the world. The transformation result, called Linked Spending, consists of more than five million planned and carried out financial transactions in 955 data sets from all over the world as Linked Open Data and is freely available and openly licensed.},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
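The central difficulty named in the abstract, namely that a fact in an RDF Data Cube is not a single triple, can be seen in a few lines. The fragment below is hand-made (it is not LinkedSpending data, and ex:refArea / ex:sector / ex:amount are placeholder properties, not real QB component properties); it only shows that the numeric value hangs off an intermediate qb:Observation node and therefore needs a multi-triple SPARQL pattern.

```python
# A hand-made RDF Data Cube-style fragment (illustrative only).
# Requires: pip install rdflib
from rdflib import Graph

data = """
@prefix qb: <http://purl.org/linked-data/cube#> .
@prefix ex: <http://example.org/> .

ex:obs1 a qb:Observation ;
    ex:refArea ex:Germany ;
    ex:sector  ex:Health ;
    ex:amount  100 .
"""

g = Graph()
g.parse(data=data, format="turtle")

# "How much was spent on health in Germany?" requires matching the whole
# observation pattern, not a single triple about ex:Germany.
query = """
PREFIX qb: <http://purl.org/linked-data/cube#>
PREFIX ex: <http://example.org/>
SELECT ?amount WHERE {
    ?obs a qb:Observation ;
         ex:refArea ex:Germany ;
         ex:sector  ex:Health ;
         ex:amount  ?amount .
}
"""

for row in g.query(query):
    print(row.amount)   # -> 100
```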
Sejdiu, Gezim
Efficient Distributed In-Memory Processing of RDF Datasets PhD Thesis
University of Bonn, Germany, 2020.
@phdthesis{DBLP:phd/dnb/Sejdiu20,
title = {Efficient Distributed In-Memory Processing of RDF Datasets},
author = {Gezim Sejdiu},
url = {https://hdl.handle.net/20.500.11811/8735},
year = {2020},
date = {2020-01-01},
school = {University of Bonn, Germany},
abstract = {Over the past decade, vast amounts of machine-readable structured information have become available through the automation of research processes as well as the increasing popularity of knowledge graphs and semantic technologies. Today, we count more than 10,000 datasets made available online following Semantic Web standards. A major and yet unsolved challenge that research faces today is to perform scalable analysis of large-scale knowledge graphs in order to facilitate applications in various domains including life sciences, publishing, and the internet of things. The main objective of this thesis is to lay foundations for efficient algorithms performing analytics, i.e. exploration, quality assessment, and querying over semantic knowledge graphs at a scale that has not been possible before. First, we propose a novel approach for statistical calculations of large RDF datasets, which scales out to clusters of machines. In particular, we describe the first distributed in-memory approach for computing 32 different statistical criteria for RDF datasets using Apache Spark. Many applications such as data integration, search, and interlinking, may take full advantage of the data when having a priori statistical information about its internal structure and coverage. However, such applications may suffer from low quality and not being able to leverage the full advantage of the data when the size of data goes beyond the capacity of the resources available. Thus, we introduce a distributed approach of quality assessment of large RDF datasets. It is the first distributed, in-memory approach for computing different quality metrics for large RDF datasets using Apache Spark. We also provide a quality assessment pattern that can be used to generate new scalable metrics that can be applied to big data. Based on the knowledge of the internal statistics of a dataset and its quality, users typically want to query and retrieve large amounts of information. As a result, it has become difficult to efficiently process these large RDF datasets. Indeed, these processes require, both efficient storage strategies and query-processing engines, to be able to scale in terms of data size. Therefore, we propose a scalable approach to evaluate SPARQL queries over distributed RDF datasets by translating SPARQL queries into Spark executable code. We conducted several empirical evaluations to assess the scalability, effectiveness, and efficiency of our proposed approaches. More importantly, various use cases i.e. Ethereum analysis, Mining Big Data Logs, and Scalable Integration of POIs, have been developed and leverages by our approach. The empirical evaluations and concrete applications provide evidence that our methodology and techniques proposed during this thesis help to effectively analyze and process large-scale RDF datasets. All the proposed approaches during this thesis are integrated into the larger SANSA framework.},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
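SANSA itself is a Scala framework on top of Apache Spark; the PySpark sketch below is therefore only a rough illustration of one of the statistical criteria mentioned above (predicate usage counts over an N-Triples file), not SANSA's actual API. The input path is a placeholder.

```python
# Counting predicate usage in a (possibly very large) N-Triples file with
# Spark. Illustration only; not the SANSA API. Requires: pip install pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("rdf-stats-sketch").getOrCreate()

# Each N-Triples line looks like: <subject> <predicate> <object> .
lines = spark.sparkContext.textFile("hdfs:///data/dataset.nt")  # placeholder path

predicate_counts = (
    lines.filter(lambda line: line.strip() and not line.startswith("#"))
         .map(lambda line: line.split(" ", 2)[1])   # crude split, assumes IRI subjects
         .map(lambda predicate: (predicate, 1))
         .reduceByKey(lambda a, b: a + b)
)

for predicate, count in predicate_counts.take(10):
    print(predicate, count)

spark.stop()
```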
Zafartavanaelmi, Hamid
Semantic Question Answering Over Knowledge Graphs: Pitfalls and Pearls PhD Thesis
University of Bonn, 2020.
@phdthesis{ZafarThesis,
title = {Semantic Question Answering Over Knowledge Graphs: Pitfalls and Pearls},
author = {Hamid Zafartavanaelmi},
url = {https://bonndoc.ulb.uni-bonn.de/xmlui/bitstream/handle/20.500.11811/9125/6154.pdf?sequence=1&isAllowed=y},
year = {2020},
date = {2020-01-01},
school = {University of Bonn},
abstract = {Nowadays, the Web provides an infrastructure to share all kinds of information which are easily accessible to humans around the world. Furthermore, the amount of information is growing rapidly and requires computing machines to process, comprehend, and extract useful information tailored for the end-users. The Semantic Web and semantic technologies play a prominent role to enable knowledge representation and reasoning for these computational processes. Semantic technologies such as ontologies and knowledge graphs are being used in various application domains, including data governance, knowledge management, chatbots, biology, etc., which aim at providing proper infrastructure to analyze the knowledge and reasoning for the computers. Semantic Question Answering systems are among the most desired platforms in recent years that facilitate access to information in knowledge graphs. They provide a natural language interface that permits the users to ask their questions posed in a natural language, without any understanding of the underlying technologies. We thus study question answering systems over knowledge graphs which aim to map an input question in natural language into a formal query, intending to retrieve a concise answer from the knowledge graph. This is a highly challenging task due to the intrinsic complexity of the natural language, such that the resulting query does not always accurately subserve the user intent, particularly, for more complex and less common questions.},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
2019
Halilaj, Lavdim
An Approach for Collaborative Ontology Development in Distributed and Heterogeneous Environments PhD Thesis
University of Bonn, Germany, 2019.
@phdthesis{DBLP:phd/dnb/Halilaj19,
title = {An Approach for Collaborative Ontology Development in Distributed
and Heterogeneous Environments},
author = {Lavdim Halilaj},
url = {http://hss.ulb.uni-bonn.de/2019/5315/5315.htm},
year = {2019},
date = {2019-01-01},
school = {University of Bonn, Germany},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
Singh, Kuldeep
Towards Dynamic Composition of Question Answering Pipelines PhD Thesis
University of Bonn, Germany, 2019.
@phdthesis{DBLP:phd/dnb/Singh19,
title = {Towards Dynamic Composition of Question Answering Pipelines},
author = {Kuldeep Singh},
url = {http://hss.ulb.uni-bonn.de/2019/5470/5470.htm},
year = {2019},
date = {2019-01-01},
school = {University of Bonn, Germany},
abstract = {Question answering (QA) over knowledge graphs has gained significant momentum over the past five years due to the increasing availability of large knowledge graphs and the rising importance of question answering for user interaction. DBpedia has been the most prominently used knowledge graph in this setting. QA systems implement a pipeline connecting a sequence of QA components for translating an input question into its corresponding formal query (e.g. SPARQL); this query will be executed over a knowledge graph in order to produce the answer of the question. Recent empirical studies have revealed that albeit overall effective, the performance of QA systems and QA components depends heavily on the features of input questions, and not even the combination of the best performing QA systems or individual QA components retrieves complete and correct answers. Furthermore, these QA systems cannot be easily reused, extended, and results cannot be easily reproduced since the systems are mostly implemented in a monolithic fashion, lack standardised interfaces, and are often not open source or available as Web services. All these drawbacks of the state of the art that prevents many of these approaches to be employed in real-world applications. In this thesis, we tackle the problem of QA over the knowledge graph and propose a generic approach to promote reusability and build question answering systems in a collaborative effort. Firstly, we define qa vocabulary and Qanary methodology to develop an abstraction level on existing QA systems and components. Qanary relies on qa vocabulary to establish guidelines for semantically describing the knowledge exchange between the components of a QA system. We implement a component-based modular framework called "Qanary Ecosystem" utilising the Qanary methodology to integrate several heterogeneous QA components in a single platform. We further present Qaestro framework that provides an approach to semantically describing question answering components and effectively enumerates QA pipelines based on a QA developer requirements. Qaestro provides all valid combinations of available QA components respecting the input-output requirement of each component to build QA pipelines. Finally, we address the scalability of QA components within a framework and propose a novel approach that chooses the best component per task to automatically build QA pipeline for each input question. We implement this model within FRANKENSTEIN, a framework able to select QA components and compose pipelines. FRANKENSTEIN extends Qanary ecosystem and utilises qa vocabulary for data exchange. It has 29 independent QA components implementing five QA tasks resulting in 360 unique QA pipelines. Each approach proposed in this thesis (Qanary methodology, Qaestro, and FRANKENSTEIN) is supported by extensive evaluation to demonstrate their effectiveness. Our contributions target a broader research agenda of offering the QA community an efficient way of applying their research to a research field which is driven by many different fields, consequently requiring a collaborative approach to achieve significant progress in the domain of question answering.},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
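The idea of enumerating only those component combinations whose inputs and outputs fit together can be sketched in a few lines. The toy model below is a deliberate simplification with three invented components and a strictly linear pipeline; it is not the Qaestro or FRANKENSTEIN implementation.

```python
# Toy enumeration of valid QA pipelines from components annotated with the
# artefact they consume and produce. Illustration only.
from itertools import permutations

# component name -> (consumes, produces)
components = {
    "EntityRecognizer": ("question", "entities"),
    "RelationLinker":   ("entities", "relations"),
    "QueryBuilder":     ("relations", "sparql"),
}

def is_valid(pipeline, start="question", goal="sparql"):
    """A pipeline is valid if every component consumes what its predecessor produced."""
    current = start
    for name in pipeline:
        consumes, produces = components[name]
        if consumes != current:
            return False
        current = produces
    return current == goal

valid = [p for p in permutations(components) if is_valid(p)]
print(valid)   # -> [('EntityRecognizer', 'RelationLinker', 'QueryBuilder')]
```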
Vargas, Diego Collarana
Strategies and Techniques for Federated Semantic Knowledge Retrieval and Integration PhD Thesis
University of Bonn, Germany, 2019, ISBN: 978-1-64368-046-0.
@phdthesis{DBLP:phd/dnb/Vargas19,
title = {Strategies and Techniques for Federated Semantic Knowledge Retrieval
and Integration},
author = {Diego Collarana Vargas},
url = {https://doi.org/10.3233/SSW200002-mono},
doi = {10.3233/SSW200002-mono},
isbn = {978-1-64368-046-0},
year = {2019},
date = {2019-01-01},
school = {University of Bonn, Germany},
abstract = {The vast amount of data shared on the Web requires effective and efficient techniques to retrieve and create machine usable knowledge out of it. The creation of integrated knowledge from the Web, especially knowledge about the same entity spread over different web data sources, is a challenging task. Several data interoperability problems such as schema, structure, or domain conflicts need to be solved during the integration process. Semantic Web Technologies have evolved as a novel approach to tackle the problem of knowledge integration out of heterogeneous data. However, knowledge retrieval and integration from web data sources is an expensive process, mainly due to the Extraction-Transformation-Load approach that predominates the process. In addition, there are increasingly many scenarios, where a full physical integration of the data is either prohibitive (e.g. due to data being hidden behind APIs) or not allowed (e.g. for data privacy concerns). Thus, a more cost-effective and federated integration approach is needed, a method that supports organizations to create valuable insights out of the heterogeneous data spread on web sources. In this thesis, we tackle the problem of knowledge retrieval an integration from heterogeneous web sources and propose a holistic semantic knowledge retrieval and integration approach that creates knowledge graphs on-demand from a federation of web sources. We focus on the representation of web sources data, which belongs to the same entity, as pieces of knowledge to then synthesize them as knowledge graph solving interoperability conflicts at integration time. First, we propose MINTE, a novel semantic integration approach that solves interoperability conflicts present in heterogeneous web sources. MINTE defines the concept of RDF molecules to represent web sources data as pieces of knowledge. Then, MINTE relies on a semantic similarity function to determine RDF molecules belonging to the same entity. Finally, MINTE employs fusion policies for the synthesis of RDF molecules into a knowledge graph. Second, we define a similarity framework for RDF molecules to identify semantically equivalent entities. The framework includes state-of-the-art semantic similarity metrics, such as GADES, but also a semantic similarity metric based on embeddings named MateTee developed in the scope of this thesis. Ultimately, based on MINTE and our similarity framework, we design a federated semantic retrieval engine named FuhSen. FuhSen is able to effectively integrate data from heterogeneous web data sources and create an integrated knowledge graphs on-demand. FuhSen is equipped with a faceted browsing user interface oriented to facilitate the exploration of on-demand built knowledge graphs. We conducted several empirical evaluations to assess the effectiveness and efficiency of our holistic approach. More importantly, three domain applications, i.e., Law Enforcement, Job Market Analysis, and Manufacturing, have been developed and managed by our approach. Both the empirical evaluations and concrete applications provide evidence that the methodology and techniques proposed in this thesis help to effectively integrate the pieces of knowledge about entities that are spread over heterogeneous web data sources.},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
Esteves, Diego
Automating the Fact-Checking Task: Challenges and Directions PhD Thesis
University of Bonn, 2019.
@phdthesis{EstevesThesis,
title = {Automating the Fact-Checking Task: Challenges and Directions},
author = {Diego Esteves},
url = {http://hdl.handle.net/20.500.11811/8030},
year = {2019},
date = {2019-01-01},
school = {University of Bonn},
abstract = {In recent years, misinformation has caused widespread alarm and has become a global concern, given the negative impact placed on society, democratic institutions and even computing systems whose primary objective is to serve as a reliable information channel, e.g., Knowledge Bases (KBs). The proliferation of fake news has a wide range of characteristics and different motivations. For instance, it can be produced unintentionally (e.g., the creation process of KBs which is mostly based on automated information extraction methods, thus naturally error-prone) or intentionally (e.g., the spread of misinformation through social media to persuade). Thus, they differ considerably in complexity, structure and number of arguments and propositions. To further exacerbate this problem, an ever-increasing amount of fake news on the Web has created another challenge to drawing correct information. This huge sea of data makes it very difficult for human fact-checkers and journalists to assess all the information manually. Therefore, addressing this problem is of utmost importance to minimize real-world circumstances which may provoke a negative impact on society, in general. Presently, Fact-Checking has emerged as a branch of natural language processing devoted to achieving this feat. Under this umbrella, Automated Fact-Checking frameworks have been proposed to perform claim verification. However, given the nature of the problem, different tasks need to be performed, from natural language understanding to source trustworthiness analysis and credibility scoring. In this thesis, we tackle the problem of fake news and underlying challenges related to the process of estimating the veracity of a given claim, discussing challenges and proposing novel models to improve the current state of the art on different sub-tasks. Thus, besides the principal task (i.e., performing automated fact-checking) we also investigate the recognition of entities on noisy data and the computation of web site credibility. Ultimately, due to the challenging nature of the automated fact-checking task - which requires a complex analysis over several perspectives - we also contribute towards the reproducibility of scientific experiments. First, we tackle the named entity recognition problem. We propose a novel multi-level approach named HORUS which - given an input token - generates heuristics based on computer vision and text mining techniques. These heuristics are then used to detect and classify named entities on noisy data (e.g., The Web). Second, we propose WebCred, a novel model to compute the credibility score of a given website, regardless of dependency on search engine results, which is a limiting factor when dealing with real scenarios. WebCred does not require any third-party service and is 100% open-source. Third, we conduct several empirical evaluations and extend DeFacto, a fact-checking framework initially designed to verify English claims in RDF format. DeFacto supports both structured claims (e.g., triple-like) as well as complex claims (i.e., natural language sentences). Last, but not least, we consistently contributed towards better reproducibility research tools, methods, and methodologies. We proposed ontologies (MEX, ML-Schema) and tools (LOG4MEX, MEX-Interfaces, Web4MEX, WASOTA) which turned into state of the art for better reproducibility of machine learning experiments, becoming part of a global W3C community.},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
Vahdati, Sahar
Collaborative Integration, Publishing and Analysis of Distributed Scholarly Metadata PhD Thesis
University of Bonn, 2019.
@phdthesis{VahdatiThesis,
title = {Collaborative Integration, Publishing and Analysis of Distributed Scholarly Metadata},
author = {Sahar Vahdati},
url = {https://d-nb.info/1181855969/34},
year = {2019},
date = {2019-01-01},
school = {University of Bonn},
abstract = {Research is becoming increasingly digital, interdisciplinary, and data-driven and affects different environments in addition to academia, such as industry, and government. Research output representation, publication, mining, analysis, and visualization are taken to a new level, driven by the increased use of Web standards and digital scholarly communication initiatives. The number of scientific publications produced by new players and the increasing digital availability of scholarly artifacts, and associated metadata are other drivers of the substantial growth in scholarly communication. The heterogeneity of scholarly artifacts and their metadata spread over different Web data sources poses a major challenge for researchers with regard to search, retrieval and exploration. For example, it has become difficult to keep track of relevant scientific results, to stay up-to-date with new scientific events and running projects, as well as to find potential future collaborators. Thus, assisting researchers with a broader integration, management, and analysis of scholarly metadata can lead to new opportunities in research and to new ways of conducting research. The data integration problem has been extensively addressed by communities in the Database, Artificial Intelligence and Semantic Web fields. However, a share of the interoperability issues are domain specific and new challenges with regard to schema, structure, or domain, arise in the context of scholarly metadata integration. Thus, a method is needed to support scientific communities to integrate and manage heterogeneous scholarly metadata in order to derive insightful analysis (e.g., quality assessment of scholarly artifacts). This thesis tackles the problem of scholarly metadata integration and develops a life cycle methodology to facilitate the integrated use of different methods, analysis techniques, and tools for improving scholarly communication. Some key steps of the metadata life cycle are implemented using a collaborative platform, which allows to keep the research communities in the loop. In particular, the use of collaborative methods is beneficial for the acquisition, integration, curation and utilization of scholarly metadata. We conducted empirical evaluations to assess the effectiveness and efficiency of the proposed approach. Our metadata transformation from legacy resources achieves reasonable performance and results in better metadata maintainability. The interlinking of metadata enhances the coherence of scholarly information spaces both qualitatively and quantitatively. Our metadata analysis techniques provide a precise quality assessment of scholarly artifacts, taking into account the perspectives of multiple stakeholders, while maintaining compatibility with existing ranking systems. These empirical evaluations and the concrete applications with a particular focus on collaborative aspects demonstrate the benefits of integrating distributed scholarly metadata.},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
2018
Galkin, Mikhail
Strategies for Managing Linked Enterprise Data PhD Thesis
University of Bonn, 2018.
@phdthesis{GalkinThesis,
title = {Strategies for Managing Linked Enterprise Data},
author = {Mikhail Galkin},
url = {http://hdl.handle.net/20.500.11811/7856},
year = {2018},
date = {2018-01-01},
school = {University of Bonn},
abstract = {Data, information and knowledge become key assets of our 21st century economy. As a result, data and knowledge management become key tasks with regard to sustainable development and business success. Often, knowledge is not explicitly represented residing in the minds of people or scattered among a variety of data sources. Knowledge is inherently associated with semantics that conveys its meaning to a human or machine agent. The Linked Data concept facilitates the semantic integration of heterogeneous data sources. However, we still lack an effective knowledge integration strategy applicable to enterprise scenarios, which balances between large amounts of data stored in legacy information systems and data lakes as well as tailored domain specific ontologies that formally describe real-world concepts. In this thesis we investigate strategies for managing linked enterprise data analyzing how actionable knowledge can be derived from enterprise data leveraging knowledge graphs. Actionable knowledge provides valuable insights, supports decision makers with clear interpretable arguments, and keeps its inference processes explainable. The benefits of employing actionable knowledge and its coherent management strategy span from a holistic semantic representation layer of enterprise data, i.e., representing numerous data sources as one, consistent, and integrated knowledge source, to unified interaction mechanisms with other systems that are able to effectively and efficiently leverage such an actionable knowledge. Several challenges have to be addressed on different conceptual levels pursuing this goal, i.e., means for representing knowledge, semantic data integration of raw data sources and subsequent knowledge extraction, communication interfaces, and implementation. In order to tackle those challenges we present the concept of Enterprise Knowledge Graphs (EKGs), describe their characteristics and advantages compared to existing approaches. We study each challenge with regard to using EKGs and demonstrate their efficiency. In particular, EKGs are able to reduce the semantic data integration effort when processing large-scale heterogeneous datasets. Then, having built a consistent logical integration layer with heterogeneity behind the scenes, EKGs unify query processing and enable effective communication interfaces for other enterprise systems. The achieved results allow us to conclude that strategies for managing linked enterprise data based on EKGs exhibit reasonable performance, comply with enterprise requirements, and ensure integrated data and knowledge management throughout its life cycle.},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
2017
Usbeck, Ricardo
Knowledge Extraction for Hybrid Question Answering PhD Thesis
Leipzig University, Germany, 2017.
@phdthesis{DBLP:phd/dnb/Usbeck17,
title = {Knowledge Extraction for Hybrid Question Answering},
author = {Ricardo Usbeck},
url = {https://nbn-resolving.org/urn:nbn:de:bsz:15-qucosa-225097},
year = {2017},
date = {2017-01-01},
school = {Leipzig University, Germany},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
2016
Sherif, Mohamed Ahmed
Automating Geospatial RDF Dataset Integration and Enrichment PhD Thesis
University of Leipzig, 2016.
@phdthesis{Sherif-thesis-2016,
title = {Automating Geospatial RDF Dataset Integration and Enrichment},
author = {Mohamed Ahmed Sherif},
url = {https://nbn-resolving.org/urn:nbn:de:bsz:15-qucosa-215708},
year = {2016},
date = {2016-12-01},
address = {Leipzig, Germany},
school = {University of Leipzig},
abstract = {Over the last years, the Linked Open Data (LOD) has evolved from a mere 12 to more than 10,000 knowledge bases. These knowledge bases come from diverse domains including (but not limited to) publications, life sciences, social networking, government, media, linguistics. Moreover, the LOD cloud also contains a large number of cross-domain knowledge bases such as DBpedia and Yago2. These knowledge bases are commonly managed in a decentralized fashion and contain partly overlapping information. This architectural choice has led to knowledge pertaining to the same domain being published by independent entities in the LOD cloud. For example, information on drugs can be found in Diseasome as well as DBpedia and Drugbank. Furthermore, certain knowledge bases such as DBLP have been published by several bodies, which in turn has led to duplicated content in the LOD. In addition, large amounts of geo-spatial information have been made available with the growth of heterogeneous Web of Data. The concurrent publication of knowledge bases containing related information promises to become a phenomenon of increasing importance with the growth of the number of independent data providers. Enabling the joint use of the knowledge bases published by these providers for tasks such as federated queries, cross-ontology question answering and data integration is most commonly tackled by creating links between the resources described within these knowledge bases. Within this thesis, we spur the transition from isolated knowledge bases to enriched Linked Data sets where information can be easily integrated and processed. To achieve this goal, we provide concepts, approaches and use cases that facilitate the integration and enrichment of information with other data types that are already present on the Linked Data Web with a focus on geo-spatial data. The first challenge that motivates our work is the lack of measures that use the geographic data for linking geo-spatial knowledge bases. This is partly due to the geo-spatial resources being described by the means of vector geometry. In particular, discrepancies in granularity and error measurements across knowledge bases render the selection of appropriate distance measures for geo-spatial resources difficult. We address this challenge by evaluating existing literature for pointset measures that can be used to measure the similarity of vector geometries. Then, we present and evaluate the ten measures that we derived from the literature on samples of three real knowledge bases. The second challenge we address in this thesis is the lack of automatic Link Discovery (LD) approaches capable of dealing with geospatial knowledge bases with missing and erroneous data. To this end, we present Colibri, an unsupervised approach that allows discovering links between knowledge bases while improving the quality of the instance data in these knowledge bases. A Colibri iteration begins by generating links between knowledge bases. Then, the approach makes use of these links to detect resources with probably erroneous or missing information. This erroneous or missing information detected by the approach is finally corrected or added. The third challenge we address is the lack of scalable LD approaches for tackling big geo-spatial knowledge bases. Thus, we present Deterministic Particle-Swarm Optimization (DPSO), a novel load balancing technique for LD on parallel hardware based on particle-swarm optimization. We combine this approach with the Orchid algorithm for geo-spatial linking and evaluate it on real and artificial data sets. The lack of approaches for automatic updating of links of an evolving knowledge base is our fourth challenge. This challenge is addressed in this thesis by the Wombat algorithm. Wombat is a novel approach for the discovery of links between knowledge bases that relies exclusively on positive examples. Wombat is based on generalisation via an upward refinement operator to traverse the space of Link Specifications (LS). We study the theoretical characteristics of Wombat and evaluate it on different benchmark data sets. The last challenge addressed herein is the lack of automatic approaches for geo-spatial knowledge base enrichment. Thus, we propose Deer, a supervised learning approach based on a refinement operator for enriching Resource Description Framework (RDF) data sets. We show how we can use exemplary descriptions of enriched resources to generate accurate enrichment pipelines. We evaluate our approach against manually defined enrichment pipelines and show that our approach can learn accurate pipelines even when provided with a small number of training examples. Each of the proposed approaches is implemented and evaluated against state-of-the-art approaches on real and/or artificial data sets. Moreover, all approaches are peer-reviewed and published in a conference or a journal paper. Throughout this thesis, we detail the ideas, implementation and the evaluation of each of the approaches. Moreover, we discuss each approach and present lessons learned. Finally, we conclude this thesis by presenting a set of possible future extensions and use cases for each of the proposed approaches.},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
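For readers unfamiliar with the point-set measures mentioned in the abstract, the snippet below computes one of the classic ones, the (symmetric) Hausdorff distance, between two small coordinate arrays. The geometries are made up for the example; the thesis itself evaluates ten such measures derived from the literature on samples of three real knowledge bases.

```python
# Symmetric Hausdorff distance between two vector geometries given as
# point arrays. Coordinates are invented for the example.
# Requires: pip install numpy scipy
import numpy as np
from scipy.spatial.distance import directed_hausdorff

geometry_a = np.array([[0.0, 0.0], [1.0, 0.0], [1.0, 1.0], [0.0, 1.0]])
geometry_b = np.array([[0.1, 0.0], [1.1, 0.1], [1.0, 1.1], [0.0, 0.9]])

d_ab = directed_hausdorff(geometry_a, geometry_b)[0]
d_ba = directed_hausdorff(geometry_b, geometry_a)[0]
hausdorff = max(d_ab, d_ba)

print(f"Hausdorff distance: {hausdorff:.3f}")
```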