This dataset corresponds to the paper “Connecting the Models: A Global Mega-model of MDE Projects on GitHub”.
The dataset consists of the following elements:
The relevant tables of the schema are:
-- Projects crawled from GitHub which may contain MDE artefacts.
-- project_path is the primary key, so each path appears at most once.
CREATE TABLE projects (
    project_path VARCHAR(255),
    name         VARCHAR(255),
    PRIMARY KEY (project_path)
);
-- Artefacts with relevant extensions and contents related to MDE.
-- Rows are keyed by file_path alone (not by project_path + file_path).
CREATE TABLE files (
    -- NOTE(review): presumably matches projects.project_path; no FOREIGN KEY
    -- is declared here, so confirm against the data.
    project_path   VARCHAR(255),
    file_path      TEXT,
    filename       VARCHAR(255),
    extension      VARCHAR(32),
    type           VARCHAR(32),
    -- The created_* and updated_* columns are all declared as plain TEXT.
    created_at     TEXT,
    created_commit TEXT,
    created_author TEXT,
    updated_at     TEXT,
    updated_commit TEXT,
    updated_author TEXT,
    PRIMARY KEY (file_path)
);
-- Project information found in GitHub (id refers to projects.project_path).
-- No PRIMARY KEY or FOREIGN KEY is declared on this table.
CREATE TABLE repo_info (
    id                  TEXT,
    name                TEXT,
    full_name           TEXT,
    description         TEXT,
    total_issues        INT,
    total_prs           INT,
    stars               INT,
    watchers            INT,
    forks               INT,
    subscribers         INT,
    -- NOTE(review): topics, labels and contributors_detail are declared TEXT;
    -- how multiple values are encoded in them is not visible here -- confirm.
    topics              TEXT,
    labels              TEXT,
    created_at          TEXT,
    updated_at          TEXT,
    contributors_count  INT,
    contributors_detail TEXT,
    parent_repo         TEXT,
    repo_type           TEXT
);
-- Project information from Git (id refers to projects.project_path).
-- No PRIMARY KEY or FOREIGN KEY is declared on this table.
CREATE TABLE repo_git (
    id                TEXT,
    ci                TEXT,
    readme_size       INT,
    commits_per_month REAL,
    license           TEXT
);

The relevant parts of the schema of the database are as follows:
-- Repositories (from the RawDb).
-- NOTE: this table shares its name with the other projects table defined
-- earlier in this file but has different columns; the two definitions
-- cannot coexist in a single database.
CREATE TABLE projects (
    id  varchar(255) PRIMARY KEY,
    url text NOT NULL
);
-- Artefact nodes (i.e., nodes that correspond to artefacts).
CREATE TABLE artefacts (
    id          varchar(255) PRIMARY KEY,
    type        varchar(255) NOT NULL,
    category    varchar(255) NOT NULL,
    name        varchar(255) NOT NULL,
    file_status varchar(255) NOT NULL,
    -- The only column here that accepts NULL.
    -- NOTE(review): presumably refers to projects.id; no FOREIGN KEY is
    -- declared, so confirm against the data.
    project_id  varchar(255)
);
-- Duplicates found.
-- The composite key (group_id, node_id) lets a group hold many nodes while
-- listing each node at most once per group.
CREATE TABLE duplication (
    group_id varchar(255) NOT NULL,
    -- NOTE(review): presumably an artefacts.id value; no FOREIGN KEY is declared.
    node_id  varchar(255) NOT NULL,
    type     varchar(255) NOT NULL,
    PRIMARY KEY (group_id, node_id)
);
-- Relationships (the same source/target can be repeated with a different type
-- to have edges with multiple annotations).
-- No PRIMARY KEY or UNIQUE constraint is declared, so the table itself does
-- not reject fully identical rows.
CREATE TABLE relationships (
    -- NOTE(review): source and target presumably hold artefacts.id values;
    -- no FOREIGN KEY is declared, so confirm against the data.
    source varchar(255) NOT NULL,
    target varchar(255) NOT NULL,
    type   varchar(255) NOT NULL
);