-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsources.yml
156 lines (154 loc) · 7.03 KB
/
sources.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
# dbt source definitions for a local OpenAlex snapshot.
#
# Each table is not a physical relation: `meta.external_location` holds a
# DuckDB table-function call (read_ndjson / read_ndjson_auto) over gzipped
# NDJSON files. The `external_location` / `formatter` meta keys appear to
# follow the dbt-duckdb external-source convention -- confirm against the
# adapter version in use.
#
# About `formatter: oldstyle`: the explicit `columns = { ... }` maps contain
# literal curly braces. Per the dbt-duckdb documentation the default formatter
# treats `{...}` as str.format placeholders, so every table that passes an
# explicit column map selects the %-style formatter instead.
#
# The block scalars below deliberately keep every line at the same indent so
# the folded (`>`) value stays a single space-joined line.
version: 2

sources:
  - name: open_alex_snapshot
    schema: main
    description: "Latest OpenAlex data snapshot"
    meta:
      # NOTE(review): source-level default. Every table below overrides it, so
      # it is currently unused -- but a table added without its own
      # `meta.external_location` would inherit this relative path, which points
      # at a single July 2023 authors partition under a different directory
      # layout. Confirm whether it should be removed or replaced.
      external_location: "read_ndjson_auto('./march_2024_snapshot/data/authors/updated_date=2023-07-*/*.gz')"
    tables:
      - name: raw_authors
        meta:
          external_location: >
            read_ndjson('/Volumes/8TB 1/march_2024_snapshot/authors/*/*.gz',
            columns = {
            id: 'VARCHAR',
            orcid: 'VARCHAR',
            display_name: 'VARCHAR',
            display_name_alternatives: 'VARCHAR[]',
            works_count: 'BIGINT',
            cited_by_count: 'BIGINT',
            most_cited_work: 'VARCHAR',
            ids: 'STRUCT(openalex VARCHAR, orcid VARCHAR, scopus VARCHAR)',
            last_known_institution: 'STRUCT(id VARCHAR, ror VARCHAR, display_name VARCHAR, country_code VARCHAR, type VARCHAR, lineage VARCHAR[])',
            counts_by_year: 'STRUCT(year VARCHAR, works_count BIGINT, oa_works_count BIGINT, cited_by_count BIGINT)[]',
            works_api_url: 'VARCHAR',
            updated_date: 'DATE',
            created_date: 'DATE',
            updated: 'VARCHAR'
            })
          formatter: oldstyle

      - name: raw_concepts
        meta:
          # NOTE(review): path aligned with the march_2024_snapshot/<entity>
          # layout used by every other table in this source; confirm that the
          # concepts directory exists in this snapshot.
          external_location: "read_ndjson_auto('/Volumes/8TB 1/march_2024_snapshot/concepts/*/*.gz')"

      - name: raw_domains
        meta:
          external_location: "read_ndjson_auto('/Volumes/8TB 1/march_2024_snapshot/domains/*/*.gz')"

      - name: raw_fields
        meta:
          external_location: "read_ndjson_auto('/Volumes/8TB 1/march_2024_snapshot/fields/*/*.gz')"

      - name: raw_funders
        meta:
          external_location: >
            read_ndjson('/Volumes/8TB 1/march_2024_snapshot/funders/*/*.gz',
            columns = {
            id: 'VARCHAR',
            display_name: 'VARCHAR',
            ids: 'STRUCT(openalex VARCHAR, wikidata VARCHAR, crossref BIGINT, doi VARCHAR, ror VARCHAR)',
            country_code: 'VARCHAR',
            description: 'VARCHAR',
            roles: 'STRUCT(role VARCHAR, id VARCHAR, works_count BIGINT)[]',
            works_count: 'BIGINT',
            cited_by_count: 'BIGINT',
            updated_date: 'DATE',
            created_date: 'DATE',
            updated: 'VARCHAR'
            })
          formatter: oldstyle

      - name: raw_institutions
        meta:
          external_location: >
            read_ndjson('/Volumes/8TB 1/march_2024_snapshot/institutions/*/*.gz',
            columns = {
            id: 'VARCHAR',
            ror: 'VARCHAR',
            display_name: 'VARCHAR',
            country_code: 'VARCHAR',
            type: 'VARCHAR',
            homepage_url: 'VARCHAR',
            image_url: 'VARCHAR',
            image_thumbnail_url: 'VARCHAR',
            display_name_acronyms: 'VARCHAR[]',
            display_name_alternatives: 'VARCHAR[]',
            works_count: 'BIGINT',
            cited_by_count: 'BIGINT',
            ids: 'STRUCT(openalex VARCHAR, ror VARCHAR, grid VARCHAR, wikipedia VARCHAR, wikidata VARCHAR, mag BIGINT)',
            geo: 'STRUCT(city VARCHAR, geonames_city_id VARCHAR, region VARCHAR, country_code VARCHAR, country VARCHAR, latitude DOUBLE, longitude DOUBLE)',
            associated_institutions: 'STRUCT(id VARCHAR, ror VARCHAR, display_name VARCHAR, country_code VARCHAR, "type" VARCHAR, relationship VARCHAR, lineage VARCHAR[])[]',
            counts_by_year: 'STRUCT(year VARCHAR, works_count BIGINT, oa_works_count BIGINT, cited_by_count BIGINT)[]',
            works_api_url: 'VARCHAR',
            updated_date: 'DATE'
            })
          formatter: oldstyle

      - name: raw_publishers
        meta:
          external_location: "read_ndjson_auto('/Volumes/8TB 1/march_2024_snapshot/publishers/*/*.gz')"

      - name: raw_sources
        meta:
          external_location: "read_ndjson_auto('/Volumes/8TB 1/march_2024_snapshot/sources/*/*.gz')"

      - name: raw_subfields
        meta:
          external_location: >
            read_ndjson('/Volumes/8TB 1/march_2024_snapshot/subfields/*/*.gz',
            columns = {
            id: 'VARCHAR',
            display_name: 'VARCHAR',
            description: 'VARCHAR',
            display_name_alternatives: 'VARCHAR[]',
            ids: 'STRUCT(wikidata VARCHAR, wikipedia VARCHAR)',
            works_count: 'BIGINT',
            cited_by_count: 'BIGINT',
            field: 'STRUCT(id VARCHAR, display_name VARCHAR)',
            domain: 'STRUCT(id VARCHAR, display_name VARCHAR)',
            updated_date: 'DATE',
            created_date: 'DATE',
            updated: 'VARCHAR'
            })
          formatter: oldstyle

      - name: raw_topics
        meta:
          external_location: >
            read_ndjson('/Volumes/8TB 1/march_2024_snapshot/topics/*/*.gz',
            columns = {
            id: 'VARCHAR',
            display_name: 'VARCHAR',
            subfield: 'STRUCT(id VARCHAR, display_name VARCHAR)',
            field: 'STRUCT(id VARCHAR, display_name VARCHAR)',
            domain: 'STRUCT(id VARCHAR, display_name VARCHAR)',
            description: 'VARCHAR',
            keywords: 'VARCHAR[]',
            ids: 'STRUCT(openalex VARCHAR, wikipedia VARCHAR)',
            updated_date: 'DATE',
            created_date: 'DATE',
            updated: 'VARCHAR'
            })
          formatter: oldstyle

      - name: raw_works
        meta:
          # `maximum_object_size` is passed explicitly for this table only; per
          # the DuckDB JSON reader documentation it is the largest single JSON
          # object (in bytes) the reader will accept.
          external_location: >
            read_ndjson('/Volumes/8TB 1/march_2024_snapshot/works/*/*.gz',
            columns = {
            id: 'VARCHAR',
            doi: 'VARCHAR',
            title: 'VARCHAR',
            display_name: 'VARCHAR',
            publication_year: 'BIGINT',
            publication_date: 'DATE',
            primary_location: 'STRUCT(source STRUCT(id VARCHAR), pdf_url VARCHAR, landing_page_url VARCHAR, is_oa BOOLEAN, "version" VARCHAR, license VARCHAR)',
            ids: 'STRUCT(openalex VARCHAR, doi VARCHAR, pmid VARCHAR, mag BIGINT, arxiv_id VARCHAR, pmcid VARCHAR)',
            type: 'VARCHAR',
            cited_by_count: 'BIGINT',
            authorships: 'STRUCT(author_position VARCHAR, author STRUCT(id VARCHAR), institutions STRUCT(id VARCHAR, display_name VARCHAR, ror VARCHAR, country_code VARCHAR, type VARCHAR, lineage VARCHAR[])[])[]',
            biblio: 'STRUCT(volume VARCHAR, issue VARCHAR, first_page VARCHAR, last_page VARCHAR)',
            is_retracted: 'BOOLEAN',
            is_paratext: 'BOOLEAN',
            mesh: 'STRUCT(is_major_topic BOOLEAN, descriptor_ui VARCHAR, descriptor_name VARCHAR, qualifier_ui VARCHAR, qualifier_name VARCHAR)[]',
            referenced_works: 'VARCHAR[]',
            related_works: 'VARCHAR[]',
            cited_by_api_url: 'VARCHAR',
            language: 'VARCHAR',
            updated_date: 'DATE',
            topics: 'STRUCT(id VARCHAR, display_name VARCHAR, score DOUBLE)[]',
            primary_topic: 'STRUCT(id VARCHAR, display_name VARCHAR, score DOUBLE)'
            },
            maximum_object_size = 20000000)
          formatter: oldstyle