Load in Wordbank administration and item data.

# Analysis-wide data settings: where to pull Wordbank data from and which
# languages' Words & Sentences (WS) forms to analyze. Use `<-` for
# assignment per R convention (was `=`).
data_mode <- "local"
languages <- c("Norwegian", "English", "Danish", "Spanish")

# Administrations of the WS form for the target languages; `language`
# becomes a factor ordered as in `languages` so facets plot consistently.
admins <- get_administration_data(filter_age = FALSE, mode = data_mode) %>%
  filter(form == "WS") %>%
  filter(language %in% languages) %>%
  mutate(language = factor(language, levels = languages))

# WS-form items for the target languages, with a numeric id parsed from
# the "item_NNN"-style identifier (digits begin at character 6).
items <- get_item_data(mode = data_mode) %>%
  filter(form == "WS") %>%
  filter(language %in% languages) %>%
  mutate(language = factor(language, levels = languages)) %>%
  mutate(num_item_id = as.numeric(substring(item_id, 6)))

Set up analysis-wide constants.

vocab_step <- 0.01  # grid resolution for vocabulary proportion (0 to 1)
min_age <- 16       # youngest age (months) included in analyses
max_age <- 31       # oldest age (months) included in analyses
num_age_groups <- 2 # number of equal-width age bins between min and max

Show number of items in each relevant section.

# Count items per section (word, word_form, complexity) for each
# language's WS form, one row per language.
sections <- items %>%
  filter(form == "WS") %>%
  group_by(language, type) %>%
  summarise(n = n()) %>%
  spread(type, n) %>%
  select(language, word, word_form, complexity)
# A section absent from a language appears as NA after spread(); treat it
# as zero items. (Fix: use `<-`, not `=`, for assignment.)
sections[is.na(sections)] <- 0
kable(sections)
language word word_form complexity
Norwegian 731 33 42
English 680 25 37
Danish 725 29 33
Spanish 680 24 37

Show total number of administrations in each language.

# Total number of WS administrations per language (count() is equivalent
# to group_by(language) %>% summarise(n = n())).
n_admin <- admins %>%
  count(language)
kable(n_admin)
language n
Norwegian 12969
English 5824
Danish 3714
Spanish 1094

Show number of administrations in each language by age group.

# Administrations per language split into equal-width age bins. Breaks
# start at min_age - 1 so the first left-open bin (min_age-1, b] still
# includes children who are exactly min_age months old.
n_admin_age <- admins %>%
  filter(age >= min_age, age <= max_age) %>%
  mutate(age_group = cut(age, breaks = seq(min_age - 1, max_age,
                                           length = num_age_groups + 1))) %>%
  group_by(language, age_group) %>% 
  summarise(n = n()) %>%
  spread(age_group, n)
kable(n_admin_age)
language (15,23] (23,31]
Norwegian 4128 5967
English 2967 2628
Danish 1582 1456
Spanish 582 512

Some utility functions for transforming data values.

# Recode an item's type: complexity items are labelled by their
# complexity category (e.g. morphology/syntax); all other types pass
# through unchanged. `type` is scalar at the call sites (unique(type)).
get_coded_type <- function(type, complexity_category) {
  if (type != "complexity") {
    return(type)
  }
  complexity_category
}

# Convert raw item responses to logical scores.
#
# For word and word_form items, success means the child "produces" the
# item; for complexity items, success means the "complex" form was
# chosen. `type` is scalar here (callers pass unique(type) per group),
# so use %in% rather than elementwise `|`; `value` may be a vector.
# Unknown types previously fell through to an invisible NULL — now an
# explicit error so misuse fails loudly.
get_value <- function(type, value) {
  if (type %in% c("word_form", "word")) {
    return(value == "produces")
  } else if (type == "complexity") {
    return(value == "complex")
  }
  stop("unknown item type: ", type, call. = FALSE)
}

# Reformat a dotted bin label ("age.15.23") into interval notation
# ("(15,23]"). Labels containing " - " (a difference of two bins) become
# the two intervals joined by "-", e.g. "(23,31]-(15,23]".
format_group <- function(group) {
  bins <- strsplit(group, " - ")[[1]]
  intervals <- vapply(bins, function(bin) {
    bounds <- strsplit(bin, "\\.")[[1]][-1]  # drop the "age" prefix
    paste0("(", paste(bounds, collapse = ","), "]")
  }, character(1), USE.NAMES = FALSE)
  paste(intervals, collapse = "-")
}

Get kid-by-item data for word form and complexity items in all languages, then aggregate them.

# Fetch and code per-child, per-item grammar data for one language.
#
# Computes each child's vocabulary size as a proportion of the
# language's word items, pulls responses to the word_form and
# complexity items, drops children who skipped an entire section
# (all-NA for a type), treats remaining NAs as non-responses (""),
# scores responses TRUE/FALSE via get_value(), and recodes complexity
# items into morphological vs. syntactic subtypes via get_coded_type().
get_lang_grammar_data <- function(lang) {
  
  lang_grammar_items <- items %>%
    filter(language == lang, type %in% c("word_form", "complexity"))
  
  # Denominator for vocab_prop: number of word items on this language's form.
  lang_num_words <- nrow(filter(items, language == lang, type == "word"))
  
  lang_admins <- admins %>%
    filter(language == lang) %>%
    mutate(vocab_prop = production / lang_num_words)
  
  lang_grammar_data <- get_instrument_data(instrument_language = lang,
                                           instrument_form = "WS",
                                           items = lang_grammar_items$item_id,
                                           administrations = lang_admins,
                                           iteminfo = lang_grammar_items,
                                           mode = data_mode) %>%
    # Work within child x section; type is constant within each group, so
    # unique(type) below is scalar as get_value/get_coded_type expect.
    group_by(data_id, type) %>%
    mutate(no_section = all(is.na(value))) %>%
    filter(!no_section) %>%
    mutate(value = ifelse(is.na(value), "", value),
           value = get_value(unique(type), value),
           coded_type = get_coded_type(unique(type), complexity_category),
           coded_type = factor(coded_type,
                               levels = c("word_form", "morphology", "syntax"),
                               labels = c("Word Form",
                                          "Complexity (Morphological)",
                                          "Complexity (Syntactic)")),
           measure = factor(type, levels = c("word_form", "complexity"),
                            labels = c("Word Form", "Complexity"))) %>%
    ungroup() %>%
    select(-complexity_category, -no_section, -type)
  
  return(lang_grammar_data)
  
}

# Combine coded grammar data across all languages, restricted to the
# analysis age window.
grammar_data <- lapply(languages, get_lang_grammar_data) %>%
  bind_rows() %>%
  filter(age >= min_age, age <= max_age)

Get by-kid summary data for all languages.

# Per-child summary: for each child (data_id) and measure, the number and
# proportion of items scored TRUE. n() counts that child's items for the
# measure; summarise() evaluates columns sequentially, so num_false may
# reference num_true.
grammar_summary <- grammar_data %>%
  group_by(language, measure, data_id, age, vocab_prop) %>%
  summarise(num_true = sum(value),
            num_false = n() - num_true,
            prop = num_true / n())

Model comparison: fit various grammar models and compare their adjusted R-squared values.

# Uncentered R-squared, 1 - RSS / sum(y^2) — appropriate for the
# no-intercept models used below. Requires the fit to carry its response
# (lm(..., y = TRUE) / rlm(..., y.ret = TRUE)).
rsq <- function(object) {
  rss <- sum(residuals(object, type = "response") ^ 2)
  1 - rss / sum(object$y ^ 2)
}

# Penalize the uncentered R-squared for model complexity: p fitted
# parameters against the residual degrees of freedom (summary()$df gives
# c(p, residual df, ...)).
adj_rsq <- function(object) {
  r2 <- rsq(object)
  dfs <- summary(object)$df
  p <- dfs[1]
  resid_df <- dfs[2]
  r2 - (1 - r2) * (p / (resid_df - 1))
}

# Candidate grammar-score models: linear, quadratic, and cubic in
# vocab_prop, each interacted with age, no intercept; fit by OLS (lm)
# and robust regression (MASS::rlm). Each fit retains its response so
# rsq()/adj_rsq() can use object$y (lm: y = TRUE; rlm: y.ret = TRUE).
models <- list(
  "lm.linear" = function(data) lm(prop ~ vocab_prop * age - 1,
                                  data, y = TRUE),
  "lm.quadratic" = function(data) lm(prop ~ I(vocab_prop ^ 2) * age + vocab_prop * age - 1,
                                     data, y = TRUE),
  "lm.cubic" = function(data) lm(prop ~ I(vocab_prop ^ 3) * age + I(vocab_prop ^ 2) * age + vocab_prop * age - 1,
                                 data, y = TRUE),
  # Fix: was `vocab_prop = TRUE`, which is not an rlm argument — the
  # parallel quadratic/cubic entries show `y.ret = TRUE` was intended,
  # and rsq() needs object$y to exist.
  "rlm.linear" = function(data) rlm(prop ~ vocab_prop * age - 1,
                                    data, y.ret = TRUE, maxit = 100),
  "rlm.quadratic" = function(data) rlm(prop ~ I(vocab_prop ^ 2) * age + vocab_prop * age - 1,
                                       data, y.ret = TRUE, maxit = 100),
  "rlm.cubic" = function(data) rlm(prop ~ I(vocab_prop ^ 3) * age + I(vocab_prop ^ 2) * age + vocab_prop * age - 1,
                                   data, y.ret = TRUE, maxit = 100)
#  "glm" = function(data) glm(cbind(sum, diff) ~ vocab_prop + age, family = "binomial",
#                             data, y = TRUE)
)

# Fit every model in `models` to `data` (optionally within `groups`) and,
# if `extract_fun` is given, replace each fitted model with
# extract_fun(model) — e.g. adj_rsq — yielding one metric column per model.
#
# Uses the legacy standard-evaluation dplyr verbs (group_by_/do_/mutate_)
# with lazyeval::interp to build the calls programmatically.
fit_models <- function(data, models, groups = NULL, extract_fun = NULL) {
  if (length(groups)) {
    data %<>% group_by_(.dots = groups)
  }
  # One `fun(data = .)` call per model constructor; do_ evaluates each
  # within the current groups, storing the fit under the model's name.
  model_dots <- map(models, function(model) interp(~fun(data = .), fun = model))
  names <- names(models)
  data %<>% do_(.dots = setNames(model_dots, names))
  if (length(extract_fun)) {
    # One `extract_fun(<model column>)` call per model column.
    extract_dots <- map(names(models),
                        function(name) interp(~fun(var), fun = extract_fun, var = as.name(name)))
    data %<>% mutate_(.dots = setNames(extract_dots, names))
  }
  data
}

# Adjusted R-squared for every candidate model within each
# language x measure cell, reshaped long for plotting.
grammar_model_metrics <- fit_models(grammar_summary, models, c("language", "measure"), adj_rsq) %>%
  gather(model, adj_rsq, -language, -measure)

# Best-fitting model per language x measure. NOTE(review): an exact tie
# in adj_rsq would yield multiple rows for that group — verify acceptable.
best_models <- grammar_model_metrics %>%
  group_by(language, measure) %>%
  summarise(model = model[adj_rsq == max(adj_rsq)],
            adj_rsq = max(adj_rsq))

Plot model metrics.

# One point per model's adjusted R-squared, faceted by language and
# measure; a horizontal line marks each facet's best model.
ggplot(grammar_model_metrics, aes(x = model, y = adj_rsq, colour = model)) +
  facet_grid(language ~ measure) +
  geom_point() +
  geom_hline(aes(yintercept = adj_rsq, colour = model), best_models) +
  scale_colour_solarized() +
  theme_bw(base_size = 14) +
  # `font` is presumably defined earlier in the document — TODO confirm
  theme(text = element_text(family = font))

Fit grammar score models and use them to predict data.

# Fit one cubic-in-vocab_prop model (interacted with age, no intercept)
# per language x measure, predict over a full age x vocabulary grid,
# then average predictions within age bins weighted by how often each
# age occurs in the data.
generate_predictions <- function(grammar_summary) {
  
  # One lm per "language_measure" key.
  grammar_models <- grammar_summary %>%
    split(paste(.$language, .$measure, sep = "_")) %>%
    map(function(lang_meas_data) {
      lm(prop ~ I(vocab_prop ^ 3) * age + I(vocab_prop ^ 2) * age + vocab_prop * age - 1,
         data = lang_meas_data)
    })
  
  # Prediction grid: every language x measure x age x vocab_prop step.
  newdata <- expand.grid(language = levels(grammar_summary$language),
                         measure = levels(grammar_summary$measure),
                         age = seq(min_age, max_age),
                         vocab_prop = seq(0, 1, vocab_step),
                         stringsAsFactors = FALSE)
  
  # Look up the group's model by its "language_measure" key and predict.
  group_predictions <- function(group) {
    model <- grammar_models[[paste(unique(group$language), unique(group$measure), sep = "_")]]
    group %>% mutate(predicted = predict(model, group))
  }
  
  predicted_data <- newdata %>%
    split(list(.$language, .$measure)) %>%
    map(group_predictions) %>%
    bind_rows()
  
  # Weight for each age. NOTE(review): this counts rows of
  # grammar_summary (one per child x measure), not distinct children —
  # proportional if every child has both measures; verify intent.
  age_sizes <- grammar_summary %>%
    group_by(age) %>%
    summarise(n = n())
  
  # Bin ages (breaks start at min_age - 1 so min_age lands in bin 1) and
  # average predictions within each bin, weighted by age sample size.
  breaks <- seq(min_age - 1, max_age, length = num_age_groups + 1)
  predicted_data %>%
    mutate(age_group = cut(
      age, breaks = breaks,
      labels = map(1:(length(breaks) - 1),
                   function(i) paste("age", breaks[i], breaks[i + 1], sep = "."))
    )) %>%
    left_join(age_sizes) %>%  # joins on `age`
    group_by(language, measure, age_group, vocab_prop) %>%
    summarise(predicted = weighted.mean(predicted, n))
}

# Bin the observed per-child summaries into the same age groups used for
# the model predictions.
binned_grammar_summary <- grammar_summary %>%
  mutate(age_group = cut(
    age,
    breaks = seq(min_age - 1, max_age, length = num_age_groups + 1)
  ))

# Model predictions with age-group labels reformatted to interval
# notation for the plot legend.
binned_grammar_predictions <- generate_predictions(grammar_summary) %>%
  ungroup() %>%
  mutate(age_group = vapply(as.character(age_group), format_group,
                            character(1), USE.NAMES = FALSE))

Plot score as a function of vocabulary size for each language and measure with model prediction curves.

# Observed per-child scores (jittered points) against vocabulary size,
# faceted by language x measure, with prediction curves per age group.
#ggsave("poster/grammar.png", width = 1120/72, height = 1220/72, dpi = 300)
ggplot(binned_grammar_summary, aes(x = vocab_prop, y = prop, colour = age_group)) + 
  geom_jitter(alpha = 0.3, size = 0.75) +
  geom_line(aes(y = predicted), size = 0.65, data = binned_grammar_predictions) + 
  facet_grid(language ~ measure) + 
  scale_x_continuous(limits = c(0, 1), breaks = seq(0, 1, 0.2),
                     name = "\nVocabulary Size") + 
  scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, 0.25),
                     "Score (Mean Items)\n") + 
  theme_bw(base_size = 20) +
  theme(legend.position = c(0.05, 0.95),
        legend.key = element_blank(),
        legend.background = element_rect(fill = "transparent"),
        # `font` is presumably defined earlier in the document — TODO confirm
        text = element_text(family = font)) +
  scale_color_solarized(name = "Age Group\n (months)")

Estimate area between curves.

# Area under each age group's predicted curve (Riemann sum over the
# vocab_prop grid), then differences between successive age groups via
# programmatically-built mutate expressions such as
# "age.23.31 - age.15.23".
calculate_area_diffs <- function(predicted) {
  
  age_groups <- levels(predicted$age_group)
  # One subtraction expression per adjacent pair of age groups; the
  # expression string doubles as the resulting column name.
  dots <- map(1:(length(age_groups) - 1),
              function(i) paste(age_groups[i + 1], age_groups[i], sep = " - "))
  predicted %>%
    group_by(language, measure, age_group) %>%
    summarise(area = sum(predicted * vocab_step)) %>% 
    spread(age_group, area) %>%
    mutate_(.dots = dots) %>%
    select_(.dots = paste0("-", age_groups)) %>%  # drop raw area columns
    gather(group, diff, -language, -measure)
  
}

# Build one bootstrap iteration: returns a function of the sample index
# that resamples children (with replacement, within each
# language x measure cell) and recomputes the area differences.
one_grammar_sample <- function(grammar_summary) {
  function(k) {
    resampled <- grammar_summary %>%
      group_by(language, measure) %>%
      sample_frac(replace = TRUE)
    resampled %>%
      generate_predictions() %>%
      calculate_area_diffs() %>%
      mutate(sample = k)
  }
}

# Bootstrap the between-age-group area differences: resample nboot times,
# then summarise each difference with its mean and confidence interval.
# ci_lower/ci_upper are assumed to come from a helper package loaded
# elsewhere in the document — TODO confirm.
get_grammar_area_diffs <- function(grammar_summary, nboot) {
  map(1:nboot, one_grammar_sample(grammar_summary)) %>%
    bind_rows() %>%
    group_by(language, measure, group) %>%
    summarise(mean = mean(diff),
              ci_lower = ci_lower(diff),
              ci_upper = ci_upper(diff)) %>%
    mutate(group = unlist(map(as.character(group), format_group)))
}

# Bootstrapped area differences (1000 resamples), with language and
# measure turned into ordered factors for consistent plotting.
grammar_area_diffs <- get_grammar_area_diffs(grammar_summary, 1000) %>%
  ungroup() %>%
  mutate(language = factor(language, levels = languages),
         measure = factor(measure, levels = c("Word Form", "Complexity")))

Plot age fan estimates for each language and measure.

# Bar plot of the mean area between age-group curves per language and
# measure, with bootstrap confidence intervals as line ranges.
#ggsave("poster/grammar_diffs.png", width = 740/72, height = 340/72, dpi = 300)
ggplot(grammar_area_diffs, aes(x = language, y = mean, fill = measure)) +
  geom_bar(position = "dodge", stat = "identity") +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper), 
                 position = position_dodge(width = 0.9)) +
  scale_fill_solarized(name = "") +
  ylab("Area between age groups\n") +
  xlab("") +
  theme_bw(base_size = 20) +
  theme(legend.position = c(0.1,0.9),
        legend.key = element_blank(),
        legend.background = element_rect(fill = "transparent"),
        # `font` is presumably defined earlier in the document — TODO confirm
        text = element_text(family = font))

Fit per-item models and use them to predict data.

# Fit one cubic model per language x item on the raw TRUE/FALSE item
# responses, predict over the age x vocabulary grid, and average
# predictions within age bins weighted by the number of distinct
# children at each age.
generate_item_predictions <- function(grammar_data) {
  
  # One lm per "language_item" key.
  item_models <- grammar_data %>%
    split(paste(.$language, .$item, sep = "_")) %>%
    map(function(lang_item_data) {
      lm(value ~ I(vocab_prop ^ 3) * age + I(vocab_prop ^ 2) * age + vocab_prop * age - 1,
         data = lang_item_data)
    })
  
  # NOTE: this local `items` shadows the global items data frame.
  items <- grammar_data %>%
    select(language, item, coded_type) %>%
    distinct()
  
  ages <- seq(min_age, max_age)
  vocab_props <- seq(0, 1, vocab_step)
  nrep <- length(ages) * length(vocab_props)
  
  # Repeat each language/item pair once per grid cell; after arrange(),
  # each pair occupies one contiguous block of nrep rows.
  newdata_items <- data.frame(language = rep(items$language, nrep),
                              item = rep(items$item, nrep)) %>%
    arrange(language, item)
  
  newdata_demos <- expand.grid(age = ages,
                               vocab_prop = vocab_props)
  
  # Recycling the demo grid nrow(items) times lines one full
  # age x vocab grid up against each contiguous language/item block.
  newdata <- bind_cols(newdata_items,
                       data.frame(age = rep(newdata_demos$age, nrow(items)),
                                  vocab_prop = rep(newdata_demos$vocab_prop, nrow(items))))
  
  # Predict from the matching per-item model.
  lang_item_predictions <- function(lang_item_data) {
    model <- item_models[[paste(unique(lang_item_data$language),
                                unique(lang_item_data$item),
                                sep = "_")]]
    lang_item_data %>% mutate(predicted = predict(model, lang_item_data))
  }
  
  predicted_data <- newdata %>%
    split(paste(.$language, .$item, sep = "_")) %>%
    map(lang_item_predictions) %>%
    bind_rows() %>%
    left_join(items)
  
  # Distinct children per age (within the analysis window) as weights.
  age_sizes <- grammar_data %>%
    select(data_id, age) %>%
    filter(age >= min_age, age <= max_age) %>%
    distinct() %>%
    group_by(age) %>%
    summarise(n = n())
  
  # Bin ages and average predictions within bins, weighted by age counts.
  breaks <- seq(min_age - 1, max_age, length = num_age_groups + 1)
  predicted_data %>%
    mutate(age_group = cut(age,
                           breaks = breaks,
                           labels = map(1:(length(breaks) - 1),
                                        function(i) {paste("age", breaks[i], breaks[i + 1],
                                                           sep = ".")}))) %>%
    left_join(age_sizes) %>%  # joins on `age`
    group_by(language, item, coded_type, age_group, vocab_prop) %>%
    summarise(predicted = weighted.mean(predicted, n))
}

Estimate area between curves.

# Area under each item's predicted curve per age group, then differences
# between successive age groups — the per-item analogue of
# calculate_area_diffs().
calculate_item_area_diffs <- function(item_predictions) {
  
  age_groups <- levels(item_predictions$age_group)
  # One subtraction expression ("age.23.31 - age.15.23") per adjacent
  # pair of age groups, evaluated by mutate_.
  dots <- map(1:(length(age_groups) - 1),
              function(i) paste(age_groups[i + 1], age_groups[i], sep = " - "))
  item_predictions %>%
    group_by(language, item, coded_type, age_group) %>%
    summarise(area = sum(predicted * vocab_step)) %>% 
    spread(age_group, area) %>%
    mutate_(.dots = dots) %>%
    select_(.dots = paste0("-", age_groups)) %>%  # drop raw area columns
    gather(group, diff, -language, -item, -coded_type)
  
}

# Build one per-item bootstrap iteration: returns a function of the
# sample index that resamples responses (with replacement, within each
# language x item) and recomputes per-item area differences.
one_item_sample <- function(grammar_data) {
  function(k) {
    resampled <- grammar_data %>%
      group_by(language, item, coded_type) %>%
      sample_frac(replace = TRUE)
    resampled %>%
      generate_item_predictions() %>%
      calculate_item_area_diffs() %>%
      mutate(sample = k)
  }
}

# Bootstrap per-item area differences: resample nboot times, drop NA
# diffs (e.g. degenerate resamples), and summarise each item's
# difference with its mean and confidence interval. ci_lower/ci_upper
# are assumed to come from a helper package loaded elsewhere — TODO confirm.
get_item_area_diffs <- function(grammar_data, nboot) {
  map(1:nboot, one_item_sample(grammar_data)) %>%
    bind_rows() %>%
    filter(!is.na(diff)) %>%
    group_by(language, item, coded_type, group) %>%
    summarise(mean = mean(diff),
              ci_lower = ci_lower(diff),
              ci_upper = ci_upper(diff)) %>%
    mutate(group = unlist(map(as.character(group), format_group)))
}

# Per-item bootstrapped differences (1000 resamples), ordered within
# language by mean difference. Setting factor levels to the row order of
# order_item makes bars plot in increasing order; this relies on
# language_item being unique per row.
item_area_diffs <- get_item_area_diffs(grammar_data, 1000) %>%
  ungroup() %>%
  arrange(language, mean) %>%
  mutate(order_item = paste(language, item, sep = "_"),
         order_item = factor(order_item, levels = order_item))

Plot age fan estimates for each item.

#ggsave("poster/item_diffs.png", width = 1120/72, height = 660/72, dpi = 300)
ggplot(item_area_diffs, aes(x = order_item, y = mean, fill = coded_type)) +
  facet_wrap(~language, scales = "free_x") +
  geom_bar(position = "dodge", stat = "identity", width = 0.75) +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper), 
                 position = position_dodge(width = 0.9)) +
  scale_fill_solarized(name = "") +
  scale_x_discrete(name = "", breaks = NULL) +
  ylab("Area between age groups\n") +
  theme_bw(base_size = 20) +
  theme(legend.position = c(0.12, 0.95),
        legend.key = element_blank(),
        legend.background = element_rect(fill = "transparent"),
        text = element_text(family = font))