Load in Wordbank administration and item data.
data_mode = "local"
languages <- c("Norwegian", "English", "Danish", "Spanish")
admins <- get_administration_data(filter_age = FALSE, mode = data_mode) %>%
filter(form == "WS", language %in% languages) %>%
mutate(language = factor(language, levels = languages))
items <- get_item_data(mode = data_mode) %>%
filter(form == "WS", language %in% languages) %>%
mutate(language = factor(language, levels = languages),
num_item_id = as.numeric(substr(item_id, 6, nchar(item_id))))
Set up analysis-wide constants.
vocab_step <- 0.01
min_age <- 16
max_age <- 31
num_age_groups <- 2
Show number of items in each relevant section.
sections <- items %>%
filter(form == "WS") %>%
group_by(language, type) %>%
summarise(n = n()) %>%
spread(type, n) %>%
select(language, word, word_form, complexity)
sections[is.na(sections)] = 0
kable(sections)
language | word | word_form | complexity |
---|---|---|---|
Norwegian | 731 | 33 | 42 |
English | 680 | 25 | 37 |
Danish | 725 | 29 | 33 |
Spanish | 680 | 24 | 37 |
Show total number of administrations in each language.
n_admin <- admins %>%
group_by(language) %>%
summarise(n = n())
kable(n_admin)
language | n |
---|---|
Norwegian | 12969 |
English | 5824 |
Danish | 3714 |
Spanish | 1094 |
Show number of administrations in each language by age group.
n_admin_age <- admins %>%
filter(age >= min_age, age <= max_age) %>%
mutate(age_group = cut(age, breaks = seq(min_age - 1, max_age,
length = num_age_groups + 1))) %>%
group_by(language, age_group) %>%
summarise(n = n()) %>%
spread(age_group, n)
kable(n_admin_age)
language | (15,23] | (23,31] |
---|---|---|
Norwegian | 4128 | 5967 |
English | 2967 | 2628 |
Danish | 1582 | 1456 |
Spanish | 582 | 512 |
Some utility functions for transforming data values.
get_coded_type <- function(type, complexity_category) {
if (type == "complexity") {
return(complexity_category)
} else {
return(type)
}
}
get_value <- function(type, value) {
if (type == "word_form" | type == "word") {
return(value == "produces")
} else if (type == "complexity") {
return(value == "complex")
}
}
format_group <- function(group) {
paste(
map(strsplit(group, " - ")[[1]],
function(bin) {
paste0("(", paste(strsplit(bin, "\\.")[[1]][-1], collapse = ","), "]")
})
, collapse = "-")
}
Get kid by item data for wordform and complexity items all languages and aggregate them.
get_lang_grammar_data <- function(lang) {
lang_grammar_items <- items %>%
filter(language == lang, type %in% c("word_form", "complexity"))
lang_num_words <- nrow(filter(items, language == lang, type == "word"))
lang_admins <- admins %>%
filter(language == lang) %>%
mutate(vocab_prop = production / lang_num_words)
lang_grammar_data <- get_instrument_data(instrument_language = lang,
instrument_form = "WS",
items = lang_grammar_items$item_id,
administrations = lang_admins,
iteminfo = lang_grammar_items,
mode = data_mode) %>%
group_by(data_id, type) %>%
mutate(no_section = all(is.na(value))) %>%
filter(!no_section) %>%
mutate(value = ifelse(is.na(value), "", value),
value = get_value(unique(type), value),
coded_type = get_coded_type(unique(type), complexity_category),
coded_type = factor(coded_type,
levels = c("word_form", "morphology", "syntax"),
labels = c("Word Form",
"Complexity (Morphological)",
"Complexity (Syntactic)")),
measure = factor(type, levels = c("word_form", "complexity"),
labels = c("Word Form", "Complexity"))) %>%
ungroup() %>%
select(-complexity_category, -no_section, -type)
return(lang_grammar_data)
}
grammar_data <- languages %>%
map(get_lang_grammar_data) %>%
bind_rows() %>%
filter(age >= min_age, age <= max_age)
Get by kid summary data for all languages.
grammar_summary <- grammar_data %>%
group_by(language, measure, data_id, age, vocab_prop) %>%
summarise(num_true = sum(value),
num_false = n() - num_true,
prop = num_true / n())
Model comparison: fit various grammar models and compare their adjusted R-squared values.
rsq <- function(object) 1 - sum(residuals(object, type = "response") ^ 2) / sum(object$y ^ 2)
adj_rsq <- function(object) {
rsq <- rsq(object)
p <- summary(object)$df[1]
n_p <- summary(object)$df[2]
rsq - (1 - rsq) * (p / (n_p - 1))
}
models <- list(
"lm.linear" = function(data) lm(prop ~ vocab_prop * age - 1,
data, y = TRUE),
"lm.quadratic" = function(data) lm(prop ~ I(vocab_prop ^ 2) * age + vocab_prop * age - 1,
data, y = TRUE),
"lm.cubic" = function(data) lm(prop ~ I(vocab_prop ^ 3) * age + I(vocab_prop ^ 2) * age + vocab_prop * age - 1,
data, y = TRUE),
"rlm.linear" = function(data) rlm(prop ~ vocab_prop * age - 1,
data, vocab_prop = TRUE, maxit = 100),
"rlm.quadratic" = function(data) rlm(prop ~ I(vocab_prop ^ 2) * age + vocab_prop * age - 1,
data, y.ret = TRUE, maxit = 100),
"rlm.cubic" = function(data) rlm(prop ~ I(vocab_prop ^ 3) * age + I(vocab_prop ^ 2) * age + vocab_prop * age - 1,
data, y.ret = TRUE, maxit = 100)
# "glm" = function(data) glm(cbind(sum, diff) ~ vocab_prop + age, family = "binomial",
# data, y = TRUE)
)
fit_models <- function(data, models, groups = NULL, extract_fun = NULL) {
if (length(groups)) {
data %<>% group_by_(.dots = groups)
}
model_dots <- map(models, function(model) interp(~fun(data = .), fun = model))
names <- names(models)
data %<>% do_(.dots = setNames(model_dots, names))
if (length(extract_fun)) {
extract_dots <- map(names(models),
function(name) interp(~fun(var), fun = extract_fun, var = as.name(name)))
data %<>% mutate_(.dots = setNames(extract_dots, names))
}
data
}
grammar_model_metrics <- fit_models(grammar_summary, models, c("language", "measure"), adj_rsq) %>%
gather(model, adj_rsq, -language, -measure)
best_models <- grammar_model_metrics %>%
group_by(language, measure) %>%
summarise(model = model[adj_rsq == max(adj_rsq)],
adj_rsq = max(adj_rsq))
Plot model metrics.
ggplot(grammar_model_metrics, aes(x = model, y = adj_rsq, colour = model)) +
facet_grid(language ~ measure) +
geom_point() +
geom_hline(aes(yintercept = adj_rsq, colour = model), best_models) +
scale_colour_solarized() +
theme_bw(base_size = 14) +
theme(text = element_text(family = font))
Fit grammar score models and use them to predict data.
generate_predictions <- function(grammar_summary) {
grammar_models <- grammar_summary %>%
split(paste(.$language, .$measure, sep = "_")) %>%
map(function(lang_meas_data) {
lm(prop ~ I(vocab_prop ^ 3) * age + I(vocab_prop ^ 2) * age + vocab_prop * age - 1,
data = lang_meas_data)
})
newdata <- expand.grid(language = levels(grammar_summary$language),
measure = levels(grammar_summary$measure),
age = seq(min_age, max_age),
vocab_prop = seq(0, 1, vocab_step),
stringsAsFactors = FALSE)
group_predictions <- function(group) {
model <- grammar_models[[paste(unique(group$language), unique(group$measure), sep = "_")]]
group %>% mutate(predicted = predict(model, group))
}
predicted_data <- newdata %>%
split(list(.$language, .$measure)) %>%
map(group_predictions) %>%
bind_rows()
age_sizes <- grammar_summary %>%
group_by(age) %>%
summarise(n = n())
breaks <- seq(min_age - 1, max_age, length = num_age_groups + 1)
predicted_data %>%
mutate(age_group = cut(
age, breaks = breaks,
labels = map(1:(length(breaks) - 1),
function(i) paste("age", breaks[i], breaks[i + 1], sep = "."))
)) %>%
left_join(age_sizes) %>%
group_by(language, measure, age_group, vocab_prop) %>%
summarise(predicted = weighted.mean(predicted, n))
}
binned_grammar_summary <- grammar_summary %>%
mutate(age_group = cut(age, breaks = seq(min_age - 1, max_age, length = num_age_groups + 1)))
binned_grammar_predictions <- generate_predictions(grammar_summary) %>%
ungroup() %>%
mutate(age_group = unlist(map(as.character(age_group), format_group)))
Plot score as a function of vocabulary size for each language and measure with model prediction curves.
#ggsave("poster/grammar.png", width = 1120/72, height = 1220/72, dpi = 300)
ggplot(binned_grammar_summary, aes(x = vocab_prop, y = prop, colour = age_group)) +
geom_jitter(alpha = 0.3, size = 0.75) +
geom_line(aes(y = predicted), size = 0.65, data = binned_grammar_predictions) +
facet_grid(language ~ measure) +
scale_x_continuous(limits = c(0, 1), breaks = seq(0, 1, 0.2),
name = "\nVocabulary Size") +
scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, 0.25),
"Score (Mean Items)\n") +
theme_bw(base_size = 20) +
theme(legend.position = c(0.05, 0.95),
legend.key = element_blank(),
legend.background = element_rect(fill = "transparent"),
text = element_text(family = font)) +
scale_color_solarized(name = "Age Group\n (months)")
Estimate area between curves.
calculate_area_diffs <- function(predicted) {
age_groups <- levels(predicted$age_group)
dots <- map(1:(length(age_groups) - 1),
function(i) paste(age_groups[i + 1], age_groups[i], sep = " - "))
predicted %>%
group_by(language, measure, age_group) %>%
summarise(area = sum(predicted * vocab_step)) %>%
spread(age_group, area) %>%
mutate_(.dots = dots) %>%
select_(.dots = paste0("-", age_groups)) %>%
gather(group, diff, -language, -measure)
}
one_grammar_sample <- function(grammar_summary) {
function(k) {
grammar_summary %>%
group_by(language, measure) %>%
sample_frac(replace = TRUE) %>%
generate_predictions() %>%
calculate_area_diffs() %>%
mutate(sample = k)
}
}
get_grammar_area_diffs <- function(grammar_summary, nboot) {
map(1:nboot, one_grammar_sample(grammar_summary)) %>%
bind_rows() %>%
group_by(language, measure, group) %>%
summarise(mean = mean(diff),
ci_lower = ci_lower(diff),
ci_upper = ci_upper(diff)) %>%
mutate(group = unlist(map(as.character(group), format_group)))
}
grammar_area_diffs <- get_grammar_area_diffs(grammar_summary, 1000) %>%
ungroup() %>%
mutate(language = factor(language, levels = languages),
measure = factor(measure, levels = c("Word Form", "Complexity")))
Plot age fan estimates for each language and measure.
#ggsave("poster/grammar_diffs.png", width = 740/72, height = 340/72, dpi = 300)
ggplot(grammar_area_diffs, aes(x = language, y = mean, fill = measure)) +
geom_bar(position = "dodge", stat = "identity") +
geom_linerange(aes(ymin = ci_lower, ymax = ci_upper),
position = position_dodge(width = 0.9)) +
scale_fill_solarized(name = "") +
ylab("Area between age groups\n") +
xlab("") +
theme_bw(base_size = 20) +
theme(legend.position = c(0.1,0.9),
legend.key = element_blank(),
legend.background = element_rect(fill = "transparent"),
text = element_text(family = font))
Fit per-item models and use them to predict data.
generate_item_predictions <- function(grammar_data) {
item_models <- grammar_data %>%
split(paste(.$language, .$item, sep = "_")) %>%
map(function(lang_item_data) {
lm(value ~ I(vocab_prop ^ 3) * age + I(vocab_prop ^ 2) * age + vocab_prop * age - 1,
data = lang_item_data)
})
items <- grammar_data %>%
select(language, item, coded_type) %>%
distinct()
ages <- seq(min_age, max_age)
vocab_props <- seq(0, 1, vocab_step)
nrep <- length(ages) * length(vocab_props)
newdata_items <- data.frame(language = rep(items$language, nrep),
item = rep(items$item, nrep)) %>%
arrange(language, item)
newdata_demos <- expand.grid(age = ages,
vocab_prop = vocab_props)
newdata <- bind_cols(newdata_items,
data.frame(age = rep(newdata_demos$age, nrow(items)),
vocab_prop = rep(newdata_demos$vocab_prop, nrow(items))))
lang_item_predictions <- function(lang_item_data) {
model <- item_models[[paste(unique(lang_item_data$language),
unique(lang_item_data$item),
sep = "_")]]
lang_item_data %>% mutate(predicted = predict(model, lang_item_data))
}
predicted_data <- newdata %>%
split(paste(.$language, .$item, sep = "_")) %>%
map(lang_item_predictions) %>%
bind_rows() %>%
left_join(items)
age_sizes <- grammar_data %>%
select(data_id, age) %>%
filter(age >= min_age, age <= max_age) %>%
distinct() %>%
group_by(age) %>%
summarise(n = n())
breaks <- seq(min_age - 1, max_age, length = num_age_groups + 1)
predicted_data %>%
mutate(age_group = cut(age,
breaks = breaks,
labels = map(1:(length(breaks) - 1),
function(i) {paste("age", breaks[i], breaks[i + 1],
sep = ".")}))) %>%
left_join(age_sizes) %>%
group_by(language, item, coded_type, age_group, vocab_prop) %>%
summarise(predicted = weighted.mean(predicted, n))
}
Estimate area between curves.
calculate_item_area_diffs <- function(item_predictions) {
age_groups <- levels(item_predictions$age_group)
dots <- map(1:(length(age_groups) - 1),
function(i) paste(age_groups[i + 1], age_groups[i], sep = " - "))
item_predictions %>%
group_by(language, item, coded_type, age_group) %>%
summarise(area = sum(predicted * vocab_step)) %>%
spread(age_group, area) %>%
mutate_(.dots = dots) %>%
select_(.dots = paste0("-", age_groups)) %>%
gather(group, diff, -language, -item, -coded_type)
}
one_item_sample <- function(grammar_data) {
function(k) {
grammar_data %>%
group_by(language, item, coded_type) %>%
sample_frac(replace = TRUE) %>%
generate_item_predictions() %>%
calculate_item_area_diffs() %>%
mutate(sample = k)
}
}
get_item_area_diffs <- function(grammar_data, nboot) {
map(1:nboot, one_item_sample(grammar_data)) %>%
bind_rows() %>%
filter(!is.na(diff)) %>%
group_by(language, item, coded_type, group) %>%
summarise(mean = mean(diff),
ci_lower = ci_lower(diff),
ci_upper = ci_upper(diff)) %>%
mutate(group = unlist(map(as.character(group), format_group)))
}
item_area_diffs <- get_item_area_diffs(grammar_data, 1000) %>%
ungroup() %>%
arrange(language, mean) %>%
mutate(order_item = paste(language, item, sep = "_"),
order_item = factor(order_item, levels = order_item))
Plot age fan estimates for each item.
#ggsave("poster/item_diffs.png", width = 1120/72, height = 660/72, dpi = 300)
ggplot(item_area_diffs, aes(x = order_item, y = mean, fill = coded_type)) +
facet_wrap(~language, scales = "free_x") +
geom_bar(position = "dodge", stat = "identity", width = 0.75) +
geom_linerange(aes(ymin = ci_lower, ymax = ci_upper),
position = position_dodge(width = 0.9)) +
scale_fill_solarized(name = "") +
scale_x_discrete(name = "", breaks = NULL) +
ylab("Area between age groups\n") +
theme_bw(base_size = 20) +
theme(legend.position = c(0.12, 0.95),
legend.key = element_blank(),
legend.background = element_rect(fill = "transparent"),
text = element_text(family = font))