data <- read_rds('../../temp/data_reg.rds')
# Just to be sure
data %<>%
distinct(AID, year, .keep_all = TRUE) %>%
relocate(AID, year, type, transition, transited_t, seniority)
# Decide what to lag
data %<>%
arrange(AID, year) %>%
group_by(AID) %>%
mutate(
across(c(concepts,
DL,
paper_n,
citation_n,
author_mean,
oa_mean,
novelty_mean,
deg_cen,
deg_cen_comp,
novelty_fw,
citation_fw,
aff_citation_fw,
aff_novelty_fw
), ~ lag(.x, 1))
) %>%
ungroup()
# Some last preprocessing
data %<>%
filter(year >= 2000,
year <= 2021) %>%
mutate(co_author_industry = deg_cen_comp > 0) %>% #,year_min = ifelse(year_min < 2000, 2000, year_min))
group_by(AID) %>%
mutate(t = 1:n(),
t_max = n()) %>%
ungroup()
# Check missing data
data %>% naniar::gg_miss_var()
# Replace again missing
data %<>%
group_by(AID) %>%
arrange(AID, year) %>%
fill(DL, concepts,
aff_id, aff_type, aff_novelty_fw, aff_citation_fw,
novelty_fw, citation_fw, paper_n, oa_mean, novelty_mean, deg_cen_comp, deg_cen, # IS not really correct, but can be done. Consider
co_author_industry, citation_n, author_mean, # IS not really correct, but can be done. Consider
.direction = "downup") %>%
ungroup()
# Replace NA. All others are by construction
data %<>%
replace_na(list(DL = 0, concepts = 'unknown'))
data %>% count(concepts, sort = TRUE) %>%
mutate(rank_pct = percent_rank(n))
data_surv <- data %>%
# Filter for last observation (=right cernsoring or transition)
filter(is.na(transited_t) | transited_t <= 1) %>%
group_by(AID) %>%
slice_max(order_by = t, n= 1, with_ties = FALSE) %>%
ungroup() %>%
# Add some variables
mutate(auth_star = citation_fw >= 0.9,
aff_star = aff_citation_fw >= 0.9,
)
# Previous filtering approach:
# filter( (type == 'education' & year == year_max) | (type == 'switcher' & transition == 1)) %>%
data_surv %>% count(type)
# Surf object
surv_object <- Surv(time = data_surv$t, event = data_surv$transition)
table(surv_object)
surv_object
1 2 3 4 4+ 5 5+ 6 6+ 7 7+ 8 8+ 9 9+ 10 10+ 11 11+ 12 12+ 13 13+ 14 14+ 15
2 71 97 117 17504 110 13583 128 10859 95 8588 63 7404 51 6341 33 5430 19 5214 27 4649 16 4068 13 3409 3
15+ 16 16+ 17 17+ 18 18+ 19 19+ 20 20+ 21 21+ 22+
3012 7 2689 3 2276 8 1917 4 1739 5 1610 9 1547 2576
fit_km0 <- survfit(surv_object ~ 1,
data = data_surv)
fit_km1 <- survfit(surv_object ~ DL,
data = data_surv)
fit_km2 <- survfit(surv_object ~ auth_star,
data = data_surv)
fit_km3 <- survfit(surv_object ~ aff_star,
data = data_surv)
p1 <- fit_km0 %>% autoplot() + labs(title= 'KM: baseline')
p2 <- fit_km1 %>% autoplot() + labs(title= 'KM: DL researcher')
p3 <- fit_km2 %>% autoplot() + labs(title= 'KM: Star researcher')
p4 <- fit_km3 %>% autoplot() + labs(title= 'KM: Star institution')
(p1 + p2) / (p3 + p4) + plot_layout(guides = "collect") & theme(legend.position = 'bottom') & labs(x = 'Time', y = 'Survival Probability')
rm(fit_km0, fit_km1, fit_km2, fit_km3,
p1, p2, p3, p4)
### Fit Cox proportional hazard model
set.seed(1337)
#Baseline
fit_cox0 <- coxph(surv_object ~ seniority + concepts + year,
data = data_surv)
Warning: Loglik converged before variable 13,31,38,40,43,48,51,53,54,56,58,60,61,62,64,67,70,72,76,78,79,85,87,89,90,94,95,96,98,99,101 ; coefficient may be infinite.
fit_cox1 <- coxph(surv_object ~ seniority + concepts + year
+ DL,
data = data_surv)
Warning: Loglik converged before variable 13,31,38,40,43,48,51,53,54,56,58,60,61,62,64,67,70,72,76,78,79,85,87,89,90,94,95,96,98,99,101 ; coefficient may be infinite.
fit_cox2 <- coxph(surv_object ~ seniority + concepts + year
+ deg_cen + deg_cen_comp,
data = data_surv)
Warning: Loglik converged before variable 55 ; coefficient may be infinite.
fit_cox3 <- coxph(surv_object ~ seniority + concepts + year
+ citation_fw + novelty_fw,
data = data_surv)
Warning: Loglik converged before variable 13,31,38,40,48,51,53,54,56,58,60,61,62,67,70,72,76,78,79,85,89,90,94,95,96,98,99,101 ; coefficient may be infinite.
fit_cox4 <- coxph(surv_object ~ seniority + concepts + year
+ aff_citation_fw,
data = data_surv)
Warning: Loglik converged before variable 13,31,38,40,43,48,51,53,54,56,58,60,61,62,64,67,70,72,76,78,79,85,87,89,90,94,95,96,98,99,101 ; coefficient may be infinite.
fit_cox5 <- coxph(surv_object ~ seniority + concepts + year
+ DL + deg_cen + deg_cen_comp + citation_fw + novelty_fw + aff_citation_fw,
data = data_surv)
#stargazer(fit_cox0, fit_cox1, fit_cox2, fit_cox3, fit_cox4, fit_cox5, type = 'latex',
# omit = c( 'concepts', 'year', 'Constant'), out ='../../output/surv_res1.tex')
#stargazer(fit_cox0, fit_cox1, fit_cox2, fit_cox3, fit_cox4, fit_cox5, type = 'text', omit = c( 'concepts', 'year', 'Constant'))
stargazer(fit_cox0, fit_cox1, fit_cox2, fit_cox3, fit_cox4, fit_cox5, type = 'html', omit = c( 'concepts', 'year', 'Constant'))
Warning: length of NULL cannot be changedWarning: length of NULL cannot
be changedWarning: length of NULL cannot be changedWarning: length of
NULL cannot be changedWarning: length of NULL cannot be changedWarning:
number of rows of result is not a multiple of vector length (arg
2)Warning: number of rows of result is not a multiple of vector length
(arg 2)Warning: number of rows of result is not a multiple of vector
length (arg 2)Warning: number of rows of result is not a multiple of
vector length (arg 2)Warning: number of rows of result is not a multiple
of vector length (arg 2)
| Dependent variable: | ||||||
| surv_object | ||||||
| (1) | (2) | (3) | (4) | (5) | (6) | |
| seniority | -0.232*** | -0.226*** | -0.239*** | -0.213*** | -0.220*** | -0.209*** |
| (0.011) | (0.011) | (0.011) | (0.011) | (0.011) | (0.011) | |
| DL | 1.349*** | 0.692*** | ||||
| (0.076) | (0.081) | |||||
| deg_cen | 0.190*** | -0.308*** | ||||
| (0.028) | (0.047) | |||||
| deg_cen_comp | 0.129*** | 0.127*** | ||||
| (0.007) | (0.007) | |||||
| citation_fw | 4.261*** | 4.181*** | ||||
| (0.130) | (0.156) | |||||
| novelty_fw | -0.876*** | -0.880*** | ||||
| (0.157) | (0.159) | |||||
| aff_citation_fw | 5.379*** | 0.292 | ||||
| (0.275) | (0.326) | |||||
| Observations | 105,296 | 105,296 | 105,296 | 105,296 | 105,296 | 105,296 |
| R2 | 0.017 | 0.020 | 0.020 | 0.030 | 0.020 | 0.033 |
| Max. Possible R2 | 0.169 | 0.169 | 0.169 | 0.169 | 0.169 | 0.169 |
| Log Likelihood | -8,867.794 | -8,709.206 | -8,684.612 | -8,173.723 | -8,697.137 | -8,017.109 |
| Wald Test | 262.910*** (df = 99) | 597.870*** (df = 100) | 2,188.910*** (df = 70) | 1,383.660*** (df = 98) | 642.810*** (df = 100) | 3,284.940*** (df = 74) |
| LR Test | 1,781.754*** (df = 99) | 2,098.929*** (df = 100) | 2,148.117*** (df = 70) | 3,169.896*** (df = 98) | 2,123.067*** (df = 100) | 3,483.124*** (df = 74) |
| Score (Logrank) Test | 1,560.421*** (df = 99) | 1,926.216*** (df = 100) | 7,626.015*** (df = 70) | 3,211.857*** (df = 98) | 1,954.019*** (df = 100) | 8,859.206*** (df = 74) |
| Note: | p<0.1; p<0.05; p<0.01 | |||||
#fit_cox5 %>% ggforest(data = data_surv)
#save models
list(fit_cox0, fit_cox1, fit_cox2, fit_cox3, fit_cox4, fit_cox5) %>% write_rds('../../temp/reg_surv1.rds')
#rm(fit_cox0, fit_cox1, fit_cox2, fit_cox3, fit_cox4, fit_cox5)
library(MatchIt)
data_match <- data %>%
filter(type == 'education' | type == 'switcher' & transition == 1) %>%
left_join(read_rds('../../temp/author_concepts_main.rds') %>% select(AID, concepts) %>% rename(concept_main = concepts), by = 'AID') %>%
select(-transition, -transited_t, -year_transit) %>%
mutate(type = (type %>% factor() %>% as.numeric()) -1,
aff_id = aff_id %>% factor(),
concepts = concepts %>% factor(),
concept_main = concept_main %>% factor()) %>%
drop_na()
# Do the PSM
set.seed(1337)
match_out <- data_match %>%
# Matching
matchit(type ~ paper_n + author_mean + oa_mean + DL + t + t_max + co_author_industry + aff_citation_fw + aff_citation_fw, exact = c('year', 'seniority', 'concept_main'),
data = ., verbose = FALSE, method = "nearest", ratio = 1) # OR method "genetic" ... takes forever.....
# GEt the matched pairs
matched_pairs <- bind_cols(data_match[row.names(match_out$match.matrix),"AID"] , data_match[match_out$match.matrix,"AID"] ) %>%
rename(orig_AID = `AID...1`,
match_AID = `AID...2`)
New names:
# construct the matched dataset
data_did <- matched_pairs %>%
mutate(pair_id = 1:n())
data_did <- data_did %>%
select(orig_AID, pair_id) %>%
rename(AID = orig_AID) %>%
bind_rows(data_did %>%
select(match_AID, pair_id) %>%
rename(AID = match_AID) ) %>%
arrange(pair_id) %>%
left_join(data, by = 'AID')
# compute transition point for matched partner
data_did %<>%
group_by(pair_id) %>%
mutate(switcher = type == 'switcher',
year_transit = max(year_transit, na.rm = TRUE),
transition = year == year_transit,
transited = year >= year_transit,
transited_t = year - year_transit + 1) %>%
mutate(across(c(year, concepts, transited), ~ factor(.x))) %>%
mutate(transited_t = ifelse(transited_t > 0, transited_t, 0)) %>%
drop_na()
data_did %>% saveRDS('../temp_reg/reg_data_did.rds')
# Delete all objects
rm(list = ls(all.names = TRUE)) #will clear all objects includes hidden objects.
gc() #free up memrory and report the memory usage.
used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 3874576 207.0 12010248 641.5 NA 12010248 641.5
Vcells 23545267 179.7 5445164727 41543.4 65536 8507759093 64909.1
data_did <- read_rds('../temp_reg/reg_data_did.rds')
# For the interaction plot code logical to numeric
# data_did %<>% mutate(switcher = switcher %>% as.numeric(), transited = transited %>% as.numeric())
data_did %>%
count(concepts, sort = TRUE) %>%
mutate(rank_pct = percent_rank(n))
# fit_did_0 <- lm(citation_rank ~ switcher + field_of_study_name, data = data_did)
fit_did_1 <- lm(citation_fw ~ (type + transited)^2 + year + seniority + year, # + concepts,
data = data_did)
fit_did_2 <- lm(citation_fw ~ (type + transited)^2 + (type + transited_t)^2 + year + seniority, # + concepts,
data = data_did)
fit_did_3 <- lm(novelty_fw ~ (type + transited)^2 + year + seniority + year, # + concepts,
data = data_did)
fit_did_4 <- lm(novelty_fw ~ (type + transited)^2 + (type + transited_t)^2 + year + seniority, # + concepts ,
data = data_did)
# + paper_n ?
#stargazer(fit_did_1, fit_did_2, fit_did_3, fit_did_4, type = 'latex', omit = c( 'concepts', 'year', 'Constant'), out ='../../output/did_res1.tex')
stargazer(fit_did_1, fit_did_2, fit_did_3, fit_did_4, type = 'text', omit = c('year', 'Constant'))
Warning: length of NULL cannot be changedWarning: length of NULL cannot be changedWarning: length of NULL cannot be changedWarning: length of NULL cannot be changedWarning: length of NULL cannot be changedWarning: number of rows of result is not a multiple of vector length (arg 2)Warning: number of rows of result is not a multiple of vector length (arg 2)
========================================================================================================================================
Dependent variable:
---------------------------------------------------------------------------------------------------------------
citation_fw novelty_fw
(1) (2) (3) (4)
----------------------------------------------------------------------------------------------------------------------------------------
typeswitcher 0.050*** 0.051*** 0.007 0.007
(0.007) (0.007) (0.005) (0.005)
transited 0.123*** 0.203*** 0.101*** 0.112***
(0.009) (0.011) (0.005) (0.007)
transited_t -0.021*** -0.003***
(0.002) (0.001)
seniority 0.004*** 0.007*** 0.006*** 0.006***
(0.0004) (0.0004) (0.0002) (0.0002)
typeswitcher:transited 0.114*** 0.172*** -0.020*** -0.010
(0.011) (0.014) (0.007) (0.009)
typeswitcher:transited_t -0.015*** -0.003*
(0.002) (0.001)
----------------------------------------------------------------------------------------------------------------------------------------
Observations 17,151 17,151 17,151 17,151
R2 0.165 0.195 0.142 0.143
Adjusted R2 0.164 0.194 0.140 0.142
Residual Std. Error 0.349 (df = 17125) 0.343 (df = 17123) 0.222 (df = 17125) 0.222 (df = 17123)
F Statistic 135.153*** (df = 25; 17125) 153.683*** (df = 27; 17123) 113.069*** (df = 25; 17125) 106.066*** (df = 27; 17123)
========================================================================================================================================
Note: *p<0.1; **p<0.05; ***p<0.01
stargazer(fit_did_1, fit_did_2, fit_did_3, fit_did_4, type = 'html', omit = c('year', 'Constant'))
Warning: length of NULL cannot be changedWarning: length of NULL cannot
be changedWarning: length of NULL cannot be changedWarning: length of
NULL cannot be changedWarning: length of NULL cannot be changedWarning:
number of rows of result is not a multiple of vector length (arg
2)Warning: number of rows of result is not a multiple of vector length
(arg 2)
| Dependent variable: | ||||
| citation_fw | novelty_fw | |||
| (1) | (2) | (3) | (4) | |
| typeswitcher | 0.050*** | 0.051*** | 0.007 | 0.007 |
| (0.007) | (0.007) | (0.005) | (0.005) | |
| transited | 0.123*** | 0.203*** | 0.101*** | 0.112*** |
| (0.009) | (0.011) | (0.005) | (0.007) | |
| transited_t | -0.021*** | -0.003*** | ||
| (0.002) | (0.001) | |||
| seniority | 0.004*** | 0.007*** | 0.006*** | 0.006*** |
| (0.0004) | (0.0004) | (0.0002) | (0.0002) | |
| typeswitcher:transited | 0.114*** | 0.172*** | -0.020*** | -0.010 |
| (0.011) | (0.014) | (0.007) | (0.009) | |
| typeswitcher:transited_t | -0.015*** | -0.003* | ||
| (0.002) | (0.001) | |||
| Observations | 17,151 | 17,151 | 17,151 | 17,151 |
| R2 | 0.165 | 0.195 | 0.142 | 0.143 |
| Adjusted R2 | 0.164 | 0.194 | 0.140 | 0.142 |
| Residual Std. Error | 0.349 (df = 17125) | 0.343 (df = 17123) | 0.222 (df = 17125) | 0.222 (df = 17123) |
| F Statistic | 135.153*** (df = 25; 17125) | 153.683*** (df = 27; 17123) | 113.069*** (df = 25; 17125) | 106.066*** (df = 27; 17123) |
| Note: | p<0.1; p<0.05; p<0.01 | |||
#library(interplot)
#fit_did_2 %>% interplot(var1 ="switcher", var2 = "transited_t", hist = TRUE)
library(interactions)
fit_did_2 %>% interact_plot(pred = transited_t, modx = switcher,
interval = TRUE) +
theme(legend.position = 'bottom')
ggsave('../../output/fig_interaction_1.png', width = 10, height = 7.5, units = 'cm')
library(interactions)
fit_did_2 %>% interact_plot(pred = transited_t, modx = switcher,
interval = TRUE, linearity.check = TRUE)
—>
data %>% select(seniority, deg_cen, deg_cen_comp, co_author_industry, DL, paper_n, author_mean, citation_fw, novelty_fw, aff_citation_fw, aff_novelty_fw) %>% as.data.frame() %>%
stargazer(summary = TRUE, type = 'html')
data %>%
select(type, seniority, deg_cen, deg_cen_comp, co_author_industry, DL, paper_n, author_mean, citation_fw, novelty_fw, aff_citation_fw, aff_novelty_fw) %>%
as.data.frame() %>%
split(.$type) %>%
walk(~ stargazer(., summary = TRUE, type = 'html'))
data %>%
filter(transited_t <= 0) %>%
mutate(auth_star = citation_fw >= 0.9,
aff_star = aff_citation_fw >= 0.9)
–>