-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathall_diabetes_cohort.R
499 lines (355 loc) · 24.4 KB
/
all_diabetes_cohort.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
# Identify those in mixed T1/T2/'Other' (those with codes for other types of diabetes) diabetes cohort as per https://github.com/Exeter-Diabetes/CPRD-Codelists#diabetes-algorithms
# Add useful baseline features including diabetes diagnosis variables (date of diagnosis, type of diabetes for Type 1/Type 2) and ethnicity
# Uses other pre-made tables:
## validDateLookup has min_dob (earliest possible DOB), and gp_end_date (earliest of last collection date from practice and deregistration)
## all_patid_ethnicity, from all_patid_ethnicity.R script as per https://github.com/Exeter-Diabetes/CPRD-Codelists#ethnicity
############################################################################################
#Setup
library(tidyverse)
library(aurum)
library(EHRBiomarkr)
rm(list=ls())
cprd = CPRDData$new(cprdEnv = "diabetes-jun2024",cprdConf = "~/.aurum.yaml")
codesets = cprd$codesets()
codes_2024 = codesets$getAllCodeSetVersion(v = "01/06/2024")
#Check codelists
current_list <- codesets$listCodeSets()
############################################################################################
#Data quality check - should only include acceptable' patients (see CPRD data specification for definition)
cprd$tables$patient %>% count() #2727999 - total patient count in download
cprd$tables$patient %>% filter(acceptable ==1) %>% count() #2727999
cprd$tables$patient %>% filter(patienttypeid ==3) %>% count() #2727999
#All are 'acceptable' and have patienttypeid==3 ('Regular')
############################################################################################
##CPRD recommend excluding 44 practices (as below) that appear likely to have merged into other contributing practices (patient data could be duplicated)
##Define patients to remove later
analysis = cprd$analysis("diabetes_cohort")
practice_exclusion_ids <- cprd$tables$patient %>%
filter(pracid == "20024" | pracid == "20036" |pracid == "20091" |pracid == "20171" | pracid == "20178" |pracid == "20202" | pracid == "20254" | pracid == "20389" |pracid == "20430" |pracid == "20452" |
pracid == "20469" | pracid == "20487" | pracid == "20552" | pracid == "20554" | pracid == "20640" | pracid == "20717" | pracid == "20734" | pracid == "20737" | pracid == "20740" | pracid == "20790" |
pracid == "20803" | pracid == "20822" | pracid == "20868" | pracid == "20912" | pracid == "20996" | pracid == "21001" | pracid == "21015" | pracid == "21078" | pracid == "21112" | pracid == "21118" |
pracid == "21172" | pracid == "21173" | pracid == "21277" | pracid == "21281" | pracid == "21331" | pracid == "21334" | pracid == "21390" | pracid == "21430" | pracid == "21444" | pracid == "21451" |
pracid == "21529" | pracid == "21553" | pracid == "21558" | pracid == "21585") %>%
analysis$cached("practice_exclusion_ids")
practice_exclusion_ids %>% count() #30759
############################################################################################
##Define patients with gender=3 (indeterminate) to remove later
gender_exclusion_ids <- cprd$tables$patient %>%
filter(gender==3) %>%
analysis$cached("gender_exclusion_ids")
gender_exclusion_ids %>% count() #43
cprd$tables$patient %>% anti_join(practice_exclusion_ids, by="patid") %>% anti_join(gender_exclusion_ids, by="patid") %>% count() #2697197
############################################################################################
# Define diabetes cohort (has diabetes QOF code with valid date)
## QOF codelist uses Read codes from version 38 (contains all codes in previous versions and was last Read version published)
# SNOMED codes uses v44 plus extra codes from v46/47 (they are identical to each other; 6 codes added) and extra codes from v48 (9 more not in v46/47)
# Includes QOF codes for non-T1/T2 types of diabetes (NB: no gestational diabetes QOF codes)
analysis = cprd$analysis("all_patid")
raw_qof_diabetes_medcodes <- cprd$tables$observation %>%
inner_join(codes_2024$qof_diabetes, by="medcodeid") %>%
analysis$cached("raw_qof_diabetes_medcodes", indexes=c("patid", "obsdate", "qof_diabetes_cat"))
analysis = cprd$analysis("diabetes_cohort")
qof_ids <- raw_qof_diabetes_medcodes %>%
inner_join(cprd$tables$validDateLookup, by="patid") %>%
filter(obsdate>=min_dob & obsdate<=gp_end_date) %>%
distinct(patid) %>%
analysis$cached("qof_ids", unique_indexes="patid")
qof_ids %>% count() #2137263
qof_ids %>% anti_join(practice_exclusion_ids, by="patid") %>% anti_join(gender_exclusion_ids, by="patid") %>% count() #2110415
############################################################################################
# Exclude diabetes insipidus or gestational diabetes ever
analysis = cprd$analysis("all_patid")
raw_diabetes_insipidus_medcodes <- cprd$tables$observation %>%
inner_join(codes_2024$diabetes_insipidus, by="medcodeid") %>%
analysis$cached("raw_diabetes_insipidus_medcodes", indexes=c("patid", "obsdate"))
analysis = cprd$analysis("diabetes_cohort")
insipidus_ids <- raw_diabetes_insipidus_medcodes %>%
distinct(patid) %>%
analysis$cached("insipidus_ids", unique_indexes="patid")
insipidus_ids %>% count() #1800
qof_ids %>% anti_join(practice_exclusion_ids, by="patid") %>% anti_join(gender_exclusion_ids, by="patid") %>% inner_join(insipidus_ids, by="patid") %>% count() #1300
gestational_ids <- raw_diabetes_medcodes %>%
filter(all_diabetes_cat=="gestational" | all_diabetes_cat == "gestational history") %>%
distinct(patid) %>%
analysis$cached("gestational_ids", unique_indexes="patid")
gestational_ids %>% count() #145147
qof_ids %>% anti_join(practice_exclusion_ids, by="patid") %>% anti_join(gender_exclusion_ids, by="patid") %>% inner_join(gestational_ids, by="patid") %>% count() #28053
#Combine all of above
diabetes_cohort_ids <- qof_ids %>%
anti_join(practice_exclusion_ids, by="patid") %>%
anti_join(gender_exclusion_ids, by="patid") %>%
anti_join(insipidus_ids, by="patid") %>%
anti_join(gestational_ids, by="patid") %>%
analysis$cached("ids", unique_indexes="patid")
diabetes_cohort_ids %>% count() #2081081
############################################################################################
# Diagnosis dates
# Earliest of: any diabetes medcode (excluding obstype==4 [family history]), HbA1c >=47.5mmol/mol, OHA script, insulin script (all - valid dates only)
## If Type 2 (determined later in this script), ignore any diabetes medcodes in year of birth - use next code/HbA1c/script
## If Type 2 and have high HbA1c or OHA/insulin script in year of birth, will exclude later
## Similarly, if Type 2 and ONLY have diabetes medcodes in year of birth (and no high HbA1cs or OHA/insulin scripts later), will exclude later
## All diabetes medcodes, OHA scripts and insulin scripts also needed for defining diabetes type - so cache these
## Have also cached HbA1c as needed for later analysis
analysis = cprd$analysis("all_patid")
## All diabetes medcodes (need for diagnosis date (cleaned) and for defining Type 1 vs Type 2 (raw))
raw_diabetes_medcodes <- cprd$tables$observation %>%
inner_join(codes_2024$all_diabetes, by="medcodeid") %>%
analysis$cached("raw_diabetes_medcodes", indexes=c("patid", "obsdate", "all_diabetes_cat"))
## All HbA1cs
### Remove if <1990 and assume in % and convert to mmol/mol if <=20 (https://github.com/Exeter-Diabetes/CPRD-Codelists#hba1c)
raw_hba1c <- cprd$tables$observation %>%
inner_join(codes_2024$hba1c, by="medcodeid") %>%
analysis$cached("raw_hba1c_medcodes", indexes=c("patid", "obsdate", "testvalue", "numunitid"))
clean_hba1c <- raw_hba1c %>%
filter(year(obsdate)>=1990) %>%
mutate(testvalue=ifelse(testvalue<=20, ((testvalue-2.152)/0.09148), testvalue)) %>%
clean_biomarker_values(testvalue, "hba1c") %>%
clean_biomarker_units(numunitid, "hba1c") %>%
group_by(patid, obsdate) %>%
summarise(testvalue=mean(testvalue, na.rm=TRUE)) %>%
ungroup() %>%
inner_join(cprd$tables$validDateLookup, by="patid") %>%
filter(obsdate>=min_dob & obsdate<=gp_end_date) %>%
select(patid, date=obsdate, testvalue) %>%
analysis$cached("clean_hba1c_medcodes", indexes=c("patid", "date", "testvalue"))
## All OHA scripts (need for diagnosis date (cleaned) and definition (cleaned))
raw_oha <- cprd$tables$drugIssue %>%
inner_join(cprd$tables$ohaLookup, by="prodcodeid") %>%
analysis$cached("raw_oha_prodcodes", indexes=c("patid", "issuedate"))
clean_oha <- raw_oha %>%
inner_join(cprd$tables$validDateLookup, by="patid") %>%
filter(issuedate>=min_dob & issuedate<=gp_end_date) %>%
select(patid, date=issuedate, dosageid, quantity, quantunitid, duration, drug_class_1, drug_class_2, drug_substance_1, drug_substance_2) %>%
analysis$cached("clean_oha_prodcodes", indexes=c("patid", "date", "drug_class_1", "drug_class_2", "drug_substance_1", "drug_substance_2"))
## All insulin scripts (need for diagnosis date (cleaned) and definition (cleaned))
raw_insulin <- cprd$tables$drugIssue %>%
inner_join(codes_2024$insulin, by="prodcodeid") %>%
analysis$cached("raw_insulin_prodcodes", indexes=c("patid", "issuedate"))
clean_insulin <- raw_insulin %>%
inner_join(cprd$tables$validDateLookup, by="patid") %>%
filter(issuedate>=min_dob & issuedate<=gp_end_date) %>%
select(patid, date=issuedate, dosageid, quantity, quantunitid, duration, insulin_cat) %>%
analysis$cached("clean_insulin_prodcodes", indexes=c("patid", "date", "insulin_cat"))
# Cleaning required for diagnosis dates
## If construct query this way (finding earliest date from each of the diagnosis codes, HbA1cs and prescriptions, and then combining, rather than combining all dates and then finding earliest date, doesn't give error about disk space)
analysis = cprd$analysis("diabetes_cohort")
## Earliest clean (i.e. with valid date) non-family history diabetes medcode, excluding diabetes in pregnancy
first_diagnosis_dm_code <- raw_diabetes_medcodes %>%
filter(all_diabetes_cat != "pregnancy") %>%
select(patid, obsdate, obstypeid) %>%
filter(obstypeid!=4) %>%
inner_join(cprd$tables$validDateLookup, by="patid") %>%
filter(obsdate>=min_dob & obsdate<=gp_end_date) %>%
group_by(patid) %>%
summarise(dm_diag_dmcodedate=min(obsdate, na.rm=TRUE)) %>%
analysis$cached("first_diagnosis_dm_code", unique_index="patid")
## Earliest clean (i.e. with valid date) non-family history diabetes medcode, excluding diabetes in pregnancy, excluding those in year of birth
first_diagnosis_dm_code_post_yob <- raw_diabetes_medcodes %>%
filter(all_diabetes_cat != "pregnancy") %>%
select(patid, obsdate, obstypeid) %>%
filter(obstypeid!=4) %>%
inner_join(cprd$tables$validDateLookup, by="patid") %>%
filter(year(obsdate)>year(min_dob) & obsdate<=gp_end_date) %>%
group_by(patid) %>%
summarise(dm_diag_dmcodedate_post_yob=min(obsdate, na.rm=TRUE)) %>%
analysis$cached("first_diagnosis_dm_code_post_yob", unique_index="patid")
## Earliest (clean) HbA1c >=48 mmol/mol
first_high_hba1c <- clean_hba1c %>%
filter(testvalue>47.5) %>%
group_by(patid) %>%
summarise(dm_diag_hba1cdate=min(date, na.rm=TRUE)) %>%
analysis$cached("first_high_hba1c", unique_index="patid")
## Earliest (clean) OHA script
first_oha <- clean_oha %>%
group_by(patid) %>%
summarise(dm_diag_ohadate=min(date, na.rm=TRUE)) %>%
analysis$cached("first_oha", unique_index="patid")
## Earliest (clean) insulin script
first_insulin <- clean_insulin %>%
group_by(patid) %>%
summarise(dm_diag_insdate=min(date, na.rm=TRUE)) %>%
analysis$cached("first_insulin", unique_index="patid")
# Calculate possible diagnosis dates (overall earliest of code/HbA1c/script, and earliest excluding codes in year of birth)
dm_diag_dates <- diabetes_cohort_ids %>%
left_join(first_diagnosis_dm_code, by="patid") %>%
left_join(first_diagnosis_dm_code_post_yob, by="patid") %>%
left_join(first_high_hba1c, by="patid") %>%
left_join(first_oha, by="patid") %>%
left_join(first_insulin, by="patid") %>%
mutate(dm_diag_date_all = pmin(ifelse(is.na(dm_diag_dmcodedate), as.Date("2050-01-01"), dm_diag_dmcodedate),
ifelse(is.na(dm_diag_hba1cdate), as.Date("2050-01-01"), dm_diag_hba1cdate),
ifelse(is.na(dm_diag_ohadate), as.Date("2050-01-01"), dm_diag_ohadate),
ifelse(is.na(dm_diag_insdate), as.Date("2050-01-01"), dm_diag_insdate), na.rm=TRUE),
dm_diag_date_all_post_yob = pmin(ifelse(is.na(dm_diag_dmcodedate_post_yob), as.Date("2050-01-01"), dm_diag_dmcodedate_post_yob),
ifelse(is.na(dm_diag_hba1cdate), as.Date("2050-01-01"), dm_diag_hba1cdate),
ifelse(is.na(dm_diag_ohadate), as.Date("2050-01-01"), dm_diag_ohadate),
ifelse(is.na(dm_diag_insdate), as.Date("2050-01-01"), dm_diag_insdate), na.rm=TRUE)) %>%
analysis$cached("dm_diag_dates", unique_indexes="patid", indexes=c("dm_diag_date_all", "dm_diag_date_all_post_yob"))
############################################################################################
# Define diabetes type: type 1 or type 2 or other
## If they have a non-T1/non-T2 code, define as 'other'
non_T1T2_ids <- raw_diabetes_medcodes %>%
filter(all_diabetes_cat=="insulin receptor abs" |
all_diabetes_cat=="malnutrition" |
all_diabetes_cat=="mody" |
all_diabetes_cat=="other type not specified" |
all_diabetes_cat=="other/unspec genetic inc syndromic" |
all_diabetes_cat=="secondary") %>%
distinct(patid) %>%
analysis$cached("non_T1T2_ids", unique_indexes="patid")
non_T1T2_ids %>% count() #14505
non_T1T2_ids <- non_T1T2_ids %>% mutate(diabetes_type="other",
diabetes_type_post_yob="other")
## Make tables for variables required for Type 1 vs 2 definition (do for everyone)
### Whether or not have valid insulin prescription
has_insulin <- clean_insulin %>%
select(patid) %>%
distinct() %>%
mutate(has_insulin=1L) %>%
analysis$cached("has_insulin", unique_indexes="patid")
### Type 1-specific code count (any date)
type1_code_count <- raw_diabetes_medcodes %>%
filter(all_diabetes_cat=="Type 1") %>%
group_by(patid) %>%
summarise(type1_code_count=n()) %>%
analysis$cached("type1_code_count", unique_indexes="patid")
### Type 2-specific code count (any date)
type2_code_count <- raw_diabetes_medcodes %>%
filter(all_diabetes_cat=="Type 2") %>%
group_by(patid) %>%
summarise(type2_code_count=n()) %>%
analysis$cached("type2_code_count", unique_indexes="patid")
### Age of diagnosis
#### First need to estimate DOB: earliest of any medcode in Observation table (besides those before mob/yob), or use the 15/mob/yob if mob where provided, or 30/06/yob if only yob provided
dob <- cprd$tables$observation %>%
inner_join(cprd$tables$validDateLookup, by="patid") %>%
filter(obsdate>=min_dob) %>%
group_by(patid) %>%
summarise(earliest_medcode=min(obsdate, na.rm=TRUE)) %>%
ungroup() %>%
analysis$cached("earliest_medcode", unique_indexes="patid")
#### Check count
dob %>% count() #2727999 - everyone in download
#### No-one has missing dob or earliest_medcode so pmin (runs as 'LEAST' in MySQL) works
dob <- dob %>%
inner_join(cprd$tables$patient, by="patid") %>%
mutate(dob=as.Date(ifelse(is.na(mob), paste0(yob,"-06-30"), paste0(yob, "-",mob,"-15")))) %>%
inner_join(cprd$tables$validDateLookup, by = "patid") %>%
mutate(dob=pmin(dob, earliest_medcode, na.rm=TRUE)) %>%
mutate(dob=ifelse(regstartdate>=min_dob & regstartdate<dob, regstartdate, dob)) %>%
select(patid, dob = dob, mob, yob, regstartdate) %>%
analysis$cached("dob", unique_indexes="patid")
#### Calculate diagnosis age from dob and diagnosis date
#### Also add in whether diagnosis date < regstartdate as will need this for time to insulin
dm_diag_age <- dm_diag_dates %>%
left_join(dob, by="patid") %>%
mutate(dm_diag_age_all=(datediff(dm_diag_date_all, dob))/365.25,
dm_diag_age_all_post_yob=(datediff(dm_diag_date_all_post_yob, dob))/365.25,
dm_diag_before_reg=dm_diag_date_all<regstartdate,
dm_diag_before_reg_post_yob=dm_diag_date_all_post_yob<regstartdate) %>%
select(patid, dob, mob, yob, regstartdate, starts_with("dm_diag")) %>%
analysis$cached("dm_diag_age", unique_indexes="patid", indexes=c("dm_diag_date_all", "dm_diag_age_all"))
### Whether first insulin script within 1 year of diagnosis
#### Included even if prescription is before registration start
time_to_insulin <- clean_insulin %>%
group_by(patid) %>%
summarise(first_insulin=min(date, na.rm=TRUE)) %>%
ungroup() %>%
inner_join(dm_diag_dates, by="patid") %>%
mutate(ins_in_1_year=ifelse(datediff(first_insulin, dm_diag_date_all)/365.25<=1, 1L, NA),
ins_in_1_year_post_yob=ifelse(datediff(first_insulin, dm_diag_date_all_post_yob)/365.25<=1, 1L, NA)) %>%
select(patid, ins_in_1_year, ins_in_1_year_post_yob) %>%
analysis$cached("time_to_insulin", unique_indexes="patid")
### Current OHA status (whether have prescription for OHA in last 6 months of records)
current_oha <- clean_oha %>%
group_by(patid) %>%
summarise(latest_oha=max(date, na.rm=TRUE)) %>%
ungroup() %>%
inner_join(cprd$tables$validDateLookup, by="patid") %>%
filter((datediff(gp_end_date, latest_oha))/365.25<=0.5) %>%
mutate(current_oha=1L) %>%
select(patid, current_oha) %>%
analysis$cached("current_oha", unique_indexes="patid")
# Calculate diabetes type when include and exclude diabetes medcodes in year of birth (yob)
diabetes_type_prelim <- diabetes_cohort_ids %>%
left_join(non_T1T2_ids, by="patid") %>%
left_join(has_insulin, by="patid") %>%
left_join(type1_code_count, by="patid") %>%
left_join(type2_code_count, by="patid") %>%
inner_join(dm_diag_age, by="patid") %>%
left_join(time_to_insulin, by="patid") %>%
left_join(current_oha, by="patid") %>%
replace_na(list(has_insulin=0L, type1_code_count=0L, type2_code_count=0L, ins_in_1_year=0L, ins_in_1_year_post_yob=0L, current_oha=0L)) %>%
mutate(diabetes_type=ifelse(
is.na(diabetes_type) &
((has_insulin==1 & type1_code_count!=0 & type2_code_count!=0 & type1_code_count>=(2*type2_code_count)) |
(has_insulin==1 & type1_code_count!=0 & type2_code_count==0) |
(has_insulin==1 & type1_code_count==0 & type2_code_count==0 & dm_diag_age_all<35 & ins_in_1_year==1) |
(has_insulin==1 & type1_code_count==0 & type2_code_count==0 & dm_diag_age_all<35 & dm_diag_before_reg==1 & current_oha==0)), "type 1",
ifelse(is.na(diabetes_type), "type 2", diabetes_type)),
diabetes_type_post_yob=ifelse(
is.na(diabetes_type_post_yob) &
((has_insulin==1 & type1_code_count!=0 & type2_code_count!=0 & type1_code_count>=(2*type2_code_count)) |
(has_insulin==1 & type1_code_count!=0 & type2_code_count==0) |
(has_insulin==1 & type1_code_count==0 & type2_code_count==0 & dm_diag_age_all_post_yob<35 & ins_in_1_year_post_yob==1) |
(has_insulin==1 & type1_code_count==0 & type2_code_count==0 & dm_diag_age_all_post_yob<35 & dm_diag_before_reg_post_yob==1 & current_oha==0)), "type 1",
ifelse(is.na(diabetes_type_post_yob), "type 2", diabetes_type_post_yob))) %>%
analysis$cached("diabetes_type_prelim", unique_indexes="patid")
############################################################################################
##Checks
# Check those that have a different type of diabetes depending on whether include medcodes in year of birth or not
check <- collect(diabetes_type_prelim %>%
filter(diabetes_type!=diabetes_type_post_yob))
## 21 people: 20 are Type 1 if include codes in year of birth, and Type 2 otherwise, 1 is Type 2 if include codes in year of birth, and Type 1 otherwise as affects time to insulin
## These people are 'unclassifiable' - exclude
# Check those with T2D who still have diagnosis date in year of birth due to having script or high HbA1c in year of birth
check <- collect(diabetes_type_prelim %>%
filter(diabetes_type=="type 2" & diabetes_type_post_yob=="type 2" & year(dm_diag_date_all_post_yob)==yob))
## 15 people: 13 with OHA in yob, 2 with high HbA1c
## Exclude these people
# Check those with T2D who only have diabetes medcodes in year of birth and no later codes/HbA1cs/scripts
check <- collect(diabetes_type_prelim %>%
filter(diabetes_type=="type 2" & diabetes_type_post_yob=="type 2" & is.na(dm_diag_date_all_post_yob)))
# 0 people
# Finalise diabetes type and date of diagnosis - recode dm_diag_dmcodedate, dm_diag_date_all, dm_diag_age_all, dm_diag_before_reg and ins_in_1_year so include/exclude diabetes medcodes in year of birth depending on diabetes type
diabetes_type_final <- diabetes_type_prelim %>%
filter(!((diabetes_type=="type 2" & diabetes_type_post_yob=="type 2" & year(dm_diag_date_all_post_yob)==yob) | (diabetes_type!=diabetes_type_post_yob))) %>%
mutate(raw_dm_diag_dmcodedate=dm_diag_dmcodedate,
raw_dm_diag_date_all=dm_diag_date_all,
dm_diag_dmcodedate=ifelse(diabetes_type=="type 2", dm_diag_dmcodedate_post_yob, dm_diag_dmcodedate),
dm_diag_date_all=ifelse(diabetes_type=="type 2", dm_diag_date_all_post_yob, dm_diag_date_all),
dm_diag_age_all=ifelse(diabetes_type=="type 2", dm_diag_age_all_post_yob, dm_diag_age_all),
dm_diag_before_reg=ifelse(diabetes_type=="type 2", dm_diag_before_reg_post_yob, dm_diag_before_reg),
ins_in_1_year=ifelse(diabetes_type=="type 2", ins_in_1_year_post_yob, ins_in_1_year)) %>%
select(-c(ends_with("post_yob"))) %>%
analysis$cached("diabetes_type_final", unique_indexes="patid")
############################################################################################
# Remove unreliable diagnosis dates, add in other variables, and cache
## Set diagnosis date to missing if between -30 and +90 days (inclusive) of registration start
## Add dm_diag_date and dm_diag_age - missing if diagnosis date is before registration
## Add variable for what type of code diagnosis is based on
## Also add in gender, registration end, practice ID, and cprd_ddate from patient table
## Also add in last collection date for practice, and derive/keep: regstartdate, gp_end_date (earliest of last collection date from practice and deregistration - will have one or other - from r_valid_date_lookup), death_date (earliest of 'cprddeathdate' (derived by CPRD))
## Also add in ethnicity from all_patid_ethnicity table from all_patid_ethnicity.R script as per https://github.com/Exeter-Diabetes/CPRD-Codelists#ethnicity
analysis = cprd$analysis("all_patid")
ethnicity <- ethnicity %>% analysis$cached("ethnicity")
#Don't have linkage data yet - can't add with HES, ONS death dates, or IMD scores
analysis = cprd$analysis("all")
diabetes_cohort <- diabetes_type_final %>%
mutate(dm_diag_date_all=if_else(datediff(dm_diag_date_all, regstartdate)>=-30 & datediff(dm_diag_date_all, regstartdate)<=90, as.Date(NA), dm_diag_date_all),
dm_diag_date=if_else(dm_diag_date_all<regstartdate, as.Date(NA), dm_diag_date_all),
dm_diag_age=ifelse(dm_diag_date_all<regstartdate, NA, dm_diag_age_all),
dm_diag_codetype = ifelse(is.na(dm_diag_date_all), NA,
ifelse(!is.na(dm_diag_dmcodedate) & dm_diag_dmcodedate==dm_diag_date_all, 1L,
ifelse(!is.na(dm_diag_hba1cdate) & dm_diag_hba1cdate==dm_diag_date_all, 2L,
ifelse(!is.na(dm_diag_ohadate) & dm_diag_ohadate==dm_diag_date_all, 3L, 4L))))) %>%
left_join((cprd$tables$patient %>% select(patid, gender, regenddate, pracid, cprd_ddate)), by="patid") %>%
left_join((cprd$tables$practice %>% select(pracid, lcd, region)), by="pracid") %>%
left_join((cprd$tables$validDateLookup %>% select(patid, gp_end_date)), by="patid") %>%
left_join(ethnicity, by="patid") %>%
select(patid, gender, dob, pracid, prac_region=region, ethnicity_5cat, ethnicity_16cat, ethnicity_qrisk2, has_insulin, type1_code_count, type2_code_count, raw_dm_diag_dmcodedate, raw_dm_diag_date_all, dm_diag_dmcodedate, dm_diag_hba1cdate, dm_diag_ohadate, dm_diag_insdate, dm_diag_date_all, dm_diag_date, dm_diag_codetype, dm_diag_age_all, dm_diag_age, dm_diag_before_reg, ins_in_1_year, current_oha, diabetes_type, regstartdate, gp_end_date) %>%
analysis$cached("diabetes_cohort", unique_indexes="patid", indexes=c("gender", "dob", "dm_diag_date_all", "dm_diag_date", "dm_diag_age_all", "dm_diag_age", "diabetes_type"))
diabetes_cohort %>% count() # 2081045
diabetes_cohort %>% filter(diabetes_type == "type 2") %>% count() # 1916916
diabetes_cohort %>% filter(diabetes_type == "type 1") %>% count() # 153212
diabetes_cohort %>% filter(diabetes_type == "other") %>% count() # 10917