# load packages
library(tidyverse)
library(tidytext)
library(ngram)
library(igraph)
library(ggraph)
library(patchwork)
library(svglite)
library(geomnet)
library(rgexf)
library(qgraph)
library(genBaRcode)
library(ggiraph)
library(tidygraph)
library(scales)

# install.packages(
#   "microViz",
#   repos = c(davidbarnett = "https://david-barnett.r-universe.dev", getOption("repos"))
# )
# if (!require("BiocManager", quietly = TRUE))
#   install.packages("BiocManager")
# BiocManager::install("phyloseq")
library(microViz)
library(plotly)
library(classInt)
A Dynamic Network Model of Bilingual Speech
About this document
This document contains the script used to analyze the data of two German-English bilingual children, “Fion” and “Silvie”, for the paper “A dynamic network model of bilingual speech”. The method is based on Ibbotson, Salnikov & Walker (2019). The data come from corpora described in more detail in e.g. Quick et al. (2018) and Koch, Endesfelder Quick & Hartmann (2025).
The “Fion” data are already publicly available on OSF and will soon be available on the Child Language Data Exchange System (CHILDES) as well. The “Silvie” data are currently in the process of being anonymized; we expect the data to become public in 2026.
Preliminaries
The packages required for the analysis are loaded in the code block at the top of this document.
Data
CHI: Monolingual + mixed
d_fion <- read_csv("../../master/fion_CHI.csv")
d_silvie <- read_csv("../../master/silvie_CHI.csv")
Data wrangling
We only want to take multi-word units into account, hence we first add a word count to each utterance and keep only utterances consisting of more than one word:
# add wordcount
d_fion$wordcount <- sapply(1:nrow(d_fion),
                           function(i) wordcount(d_fion$Utterance_clean[i]))
d_silvie$wordcount <- sapply(1:nrow(d_silvie),
                             function(i) wordcount(d_silvie$Utterance_clean[i]))

# only multi-word units
mwu_fion <- filter(d_fion, wordcount > 1)
mwu_silvie <- filter(d_silvie, wordcount > 1)
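Just to illustrate how the wordcount function from the ngram package behaves here, a toy check on invented strings (not part of the pipeline):

# toy check: wordcount() counts space-separated words
wordcount("das ist ein haus")   # 4
wordcount("no")                 # 1 -> such one-word utterances are dropped above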
Adding language tags: The column “Lang_Tags” already contains word-by-word tags for the code-mixed utterances, but not for the monolingual ones. However, the “type” column indicates whether an utterance is English, German, or mixed, so for all non-mixed utterances we can fill in the Lang_Tags column from this information. Before doing so, we add a few annotations that are missing in the “Fion” and “Silvie” files:
mwu_fion <- mwu_fion %>% mutate(Lang_Tags = case_when(
  Utterance_clean == "darf ich this nicht aufraeumen" ~ "g g e g g",
  Utterance_clean == "komm ich this nicht aufraeumen" ~ "g g e g g",
  Utterance_clean == "und this my one" ~ "g e e e",
  Utterance_clean == "nein no" ~ "g e",
  Utterance_clean == "you did birthday in juni" ~ "e e e eg g",
  Utterance_clean == "that istis ein birthday my nanny" ~ "e ge g e e e",
  Utterance_clean == "und das ist von bob the builder und wendy" ~ "g g g g e e e g eg",
  .default = Lang_Tags
))
mwu_silvie <- mwu_silvie %>% mutate(Lang_Tags = case_when(
  Utterance_clean == "und this noch" ~ "g e g",
  Utterance_clean == "der postman pat" ~ "g e e",
  Utterance_clean == "ja a game" ~ "g e e",
  Utterance_clean == "ja this big one" ~ "g e e e",
  Utterance_clean == "ja a starfish" ~ "g e e",
  Utterance_clean == "ja the ribbon" ~ "g e e",
  Utterance_clean == "das heisst naemlich train train train train" ~ "g g g e e e e",
  .default = Lang_Tags
))
Also, there are some inconsistencies in the tagging that lead to more factor levels than necessary; we correct those:
$Lang_Tags <- gsub("e\\(meta\\)", "e", mwu_fion$Lang_Tags)
mwu_fion$Lang_Tags <- gsub("ge", "eg", mwu_fion$Lang_Tags)
mwu_fion$Lang_Tags <- gsub("m", "eg", mwu_fion$Lang_Tags)
mwu_fion
$Lang_Tags <- gsub("ge", "eg", mwu_silvie$Lang_Tags)
mwu_silvie$Lang_Tags <- gsub("m", "eg", mwu_silvie$Lang_Tags) mwu_silvie
Now we can proceed:
# add language tags
mwu_fion$Lang_Tags <- gsub("[[:punct:]]", "", mwu_fion$Lang_Tags)
mwu_silvie$Lang_Tags <- gsub("[[:punct:]]", "", mwu_silvie$Lang_Tags)

# add language tags on a word-by-word basis to the non-code-mixed utterances
mwu_fion$Lang_Tags <- sapply(1:nrow(mwu_fion), function(i)
  ifelse(is.na(mwu_fion$Lang_Tags[i]),
         ifelse(mwu_fion[i,]$type == "german", paste0(rep("g", mwu_fion[i,]$wordcount), collapse = " "), paste0(rep("e", mwu_fion[i,]$wordcount), collapse = " ")),
         mwu_fion$Lang_Tags[i]))
mwu_silvie$Lang_Tags <- sapply(1:nrow(mwu_silvie), function(i)
  ifelse(is.na(mwu_silvie$Lang_Tags[i]),
         ifelse(mwu_silvie[i,]$type == "german", paste0(rep("g", mwu_silvie[i,]$wordcount), collapse = " "), paste0(rep("e", mwu_silvie[i,]$wordcount), collapse = " ")),
         mwu_silvie$Lang_Tags[i]))
Get bigrams
As we are interested in transitional probabilities between words, we need bigrams, which we get using the unnest_tokens function from the tidytext package. For each bigram, we also want the language information for the individual words, which is why we also extract bigrams from the Lang_Tags column in a second step and then join the dataframes. Finally, we split up the bigrams so that word1 and word2 end up in different columns, which makes it easier to calculate the transition probabilities.
bigrams_fion <- mwu_fion %>% unnest_tokens(bigram, Utterance_clean, token = "ngrams", n = 2, drop = FALSE)
bigrams_silvie <- mwu_silvie %>% unnest_tokens(bigram, Utterance_clean, token = "ngrams", n = 2, drop = FALSE)

bigrams_fion <- bind_cols(bigrams_fion,
                          mwu_fion %>% unnest_tokens(bigram_LangTag, Lang_Tags, token = "ngrams", n = 2, drop = FALSE) %>% select(bigram_LangTag))

bigrams_silvie <- bind_cols(bigrams_silvie,
                            mwu_silvie %>% unnest_tokens(bigram_LangTag, Lang_Tags, token = "ngrams", n = 2, drop = FALSE) %>% select(bigram_LangTag))

# one column for each word
bigrams_fion <- bigrams_fion %>% separate(bigram, c("word1", "word2"), sep = " ", remove = F)
bigrams_silvie <- bigrams_silvie %>% separate(bigram, c("word1", "word2"), sep = " ", remove = F)

bigrams_fion <- bigrams_fion %>% separate(bigram_LangTag, c("LangTag1", "LangTag2"), sep = " ", remove = F)
bigrams_silvie <- bigrams_silvie %>% separate(bigram_LangTag, c("LangTag1", "LangTag2"), sep = " ", remove = F)

# add child column
bigrams_fion <- mutate(bigrams_fion, Child = "Fion")
bigrams_silvie <- mutate(bigrams_silvie, Child = "Silvie")
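To make the structure of this step concrete, here is a toy illustration on a single invented utterance (the toy object is for demonstration only and not part of the pipeline):

# toy illustration: bigrams plus word-by-word language tags for one mixed utterance
toy <- tibble(Utterance_clean = "und this my one", Lang_Tags = "g e e e")
toy %>%
  unnest_tokens(bigram, Utterance_clean, token = "ngrams", n = 2, drop = FALSE) %>%
  bind_cols(toy %>% unnest_tokens(bigram_LangTag, Lang_Tags, token = "ngrams", n = 2, drop = FALSE) %>% select(bigram_LangTag)) %>%
  separate(bigram, c("word1", "word2"), sep = " ", remove = FALSE) %>%
  separate(bigram_LangTag, c("LangTag1", "LangTag2"), sep = " ", remove = FALSE)
# yields three bigram rows: "und this" (g e), "this my" (e e), "my one" (e e)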
Function for periodization
The following functions add an `age_range` column to the data in which several months can be binned into larger periods. This makes it easier to try out different periodization options (or to stick with the original monthly data, i.e. to have one network per month).
To make an informed decision about the way the data are split, let’s first take a quick look at the distribution of data across the timespan:
# number of words
d_fion %>% group_by(Month) %>% summarise(
  n_words = n(),
  n_files = length(unique(Filename))
) %>% ggplot(aes(x = Month, y = n_words, label = n_files)) +
  geom_point() +
  geom_line(group = 1) +
  geom_text(position = position_stack(), vjust = -0.4) +
  theme(axis.text.x = element_text(angle = 45, hjust = .9, size = 12)) +
  ylab("Number of words") +
  ggtitle("Fion (numbers indicate number of transcripts)") +
  d_silvie %>% group_by(Month) %>% summarise(
    n_words = n(),
    n_files = length(unique(Filename))
  ) %>% ggplot(aes(x = Month, y = n_words, label = n_files)) +
  geom_point() +
  geom_line(group = 1) +
  geom_text(position = position_stack(), vjust = -0.4) +
  theme(axis.text.x = element_text(angle = 45, hjust = .9, size = 12)) +
  ylab("Number of words") +
  ggtitle("Silvie (numbers indicate number of transcripts)")
In both datasets, we have fewer transcripts and, hence, fewer words in the later periods. Thus, it seems useful to work with three-month periods and bootstrapped 1000-word samples.
# function for getting equal-sized intervals
get_equal_bins <- function(x, n) {
  
  cur_var <- 1:length(unique(x))
  
  cur_breaks <- round(seq(1, length(unique(x)), by = length(unique(x)) / n))
  
  # add last number to breaks
  cur_breaks[length(cur_breaks)+1] <- cur_var[length(cur_var)]
  
  # add 0 to var and breaks so that we can calculate +1
  # below to avoid overlaps between two groups
  cur_var <- c(0, cur_var)
  cur_breaks[1] <- 0 # to make sure that it starts with 1
  
  cur_list <- lapply(1:(length(cur_breaks)-1), function(i) cur_var[(which(cur_var==cur_breaks[i])+1):(which(cur_var==cur_breaks[1+i]))])
  
  return(cur_list)
}
# function for adding the age range column
add_month_bins <- function(df, n_bins) {
  
  cur_bins <- get_equal_bins(df$Month, n_bins)
  
  # list to dataframe
  cur_bins_df <- do.call(rbind, lapply(1:length(cur_bins), function(i) tibble(index = i,
                                                                              no = cur_bins[[i]])))
  # get bin range
  bins_tbl01 <- tibble(age = unique(df$Month),
                       no = 1:length(unique(df$Month)))
  # join
  cur_bins_df <- left_join(cur_bins_df, bins_tbl01)
  
  # bins in character form (from a to b)
  cur_bins_ch <- sapply(1:length(cur_bins), function(i) paste0(unique(df$Month)[cur_bins[[i]][1]], "-",
                                                               unique(df$Month)[cur_bins[[i]][length(cur_bins[[i]])]]))
  # in tabular form
  bins_tbl <- tibble(bin = 1:length(cur_bins),
                     age_range = cur_bins_ch)
  # add to existing dataframe
  cur_bins_df <- left_join(cur_bins_df, bins_tbl, by = c("index" = "bin"))
  
  # add the age_range column to the original dataframe and return it
  cur_df_with_age_range <- left_join(df, select(cur_bins_df, age, age_range), by = c("Month" = "age"))
  
  return(cur_df_with_age_range)
}
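As a quick check that the binning behaves as intended, here is a toy example (the object toy_df and its month labels are invented for illustration and not used elsewhere):

# toy check: six unique months, three bins
toy_df <- tibble(Month = rep(c("02;03", "02;04", "02;05", "02;06", "02;07", "02;08"), each = 2))
add_month_bins(toy_df, n_bins = 3) %>% distinct(Month, age_range)
# each month is mapped to an age-range label such as "02;03-02;05"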
Getting periods and samples
As mentioned above, we want to work with bootstrapped 1,000-word samples from three-month periods. We therefore divide the Fion data into 21/3 = 7 bins and the Silvie data into 18/3 = 6 bins.
# how many months for Fion and Silvie?
length(unique(d_fion$Month))
[1] 21
length(unique(d_silvie$Month))
[1] 18
# add bins
bigrams_fion <- bigrams_fion %>% add_month_bins(n_bins = 7)
bigrams_silvie <- bigrams_silvie %>% add_month_bins(n_bins = 6)

# get samples
# Fion:
for(i in 1:7) {
  cur_fion <- filter(bigrams_fion, age_range == levels(factor(bigrams_fion$age_range))[i])
  set.seed(i)
  cur_samples <- lapply(1:100, function(i) sample(1:length(unique(cur_fion$Utt_no)), 450))
  assign(paste0("bigrams_fion0", i), cur_fion[unlist(cur_samples), ])
}

# Silvie:
for(i in 1:6) {
  cur_silvie <- filter(bigrams_silvie, age_range == levels(factor(bigrams_silvie$age_range))[i])
  set.seed(i)
  cur_samples <- lapply(1:100, function(i) sample(1:length(unique(cur_silvie$Utt_no)), 450))
  assign(paste0("bigrams_silvie0", i), cur_silvie[unlist(cur_samples), ])
}
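As an optional sanity check (not part of the original analysis), we can verify that each sample object contains 100 draws of 450 sampled rows, i.e. 45,000 bigram rows:

# optional sanity check: 100 bootstrap draws x 450 sampled rows = 45,000 rows each
sapply(1:7, function(i) nrow(get(paste0("bigrams_fion0", i))))
sapply(1:6, function(i) nrow(get(paste0("bigrams_silvie0", i))))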
Getting networks
The following function calculates the actual networks from the transition frequencies between words, filtering out all bigrams attested fewer than n_min times (the default is n_min = 0, i.e. no filtering).
get_network <- function(bigram_df, n_min = 0, modularity_measure = FALSE) {
  
  # count the bigrams
  bigrams_count <- bigram_df %>% group_by(LangTag1, LangTag2) %>% count(word1, word2, sort = T)
  
  # filter out everything attested less than n_min times
  l <- bigrams_count %>%
    filter(n >= n_min)
  
  # check if there are data
  if(nrow(l) > 0) {
    
    # get bigram graph edges and vertices
    bigram_graph <- bigrams_count %>%
      filter(n >= n_min) %>%
      ungroup %>% select(word1, word2, n) %>% graph_from_data_frame(directed = FALSE)
    
    # set weight attributes
    bigram_graph <- set_edge_attr(bigram_graph, "weight", value = l$n)
    
    # set labels
    V(bigram_graph)$label <- V(bigram_graph)$name
    
    # Louvain clustering
    lv <- cluster_louvain(bigram_graph)
    
    # add Louvain clustering to graph
    V(bigram_graph)$community <- membership(lv)
    
    # add language and word frequency as attributes;
    # to do so, we need a list of unigrams
    
    # language tags of unigrams
    unigram_LangTags <- bind_cols(bigram_df %>% select(Utt_no, Utterance_clean, Lang_Tags) %>% unique() %>% unnest_tokens(output = "unigram", input = "Utterance_clean", token = "ngrams", n = 1),
                                  select(bigram_df %>% select(Utt_no, Utterance_clean, Lang_Tags) %>% unique() %>% unnest_tokens(output = "unigram_LangTag", input = Lang_Tags, token = "ngrams", n = 1), unigram_LangTag)) %>% select(unigram, unigram_LangTag) %>% unique()
    
    # get frequencies of individual words
    unigrams_freqs <- bigram_df %>% select(Utt_no, Utterance_clean) %>% unique() %>% unnest_tokens(output = "unigram", input = Utterance_clean) %>% group_by(unigram) %>% summarise(
      n = n()
    )
    
    # add unigram LangTags as attributes to the graph
    V(bigram_graph)$language <- sapply(1:length(V(bigram_graph)), function(i) unigram_LangTags[which(unigram_LangTags$unigram == V(bigram_graph)$name[i]),]$unigram_LangTag[1])
    
    # add color as attribute
    V(bigram_graph)$color <- case_when(V(bigram_graph)$language == "g" ~ "salmon",
                                       V(bigram_graph)$language == "e" ~ "deepskyblue",
                                       V(bigram_graph)$language == "eg" ~ "tan",
                                       .default = "grey")
    
    # add frequency as attribute
    V(bigram_graph)$Freq <- sapply(1:length(V(bigram_graph)), function(i) unigrams_freqs[which(unigrams_freqs$unigram==V(bigram_graph)$name[i]),]$n[1])
    
    # return graph or modularity measure
    if(modularity_measure) {
      return(modularity(lv))
    } else {
      return(bigram_graph)
    }
  }
}
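For illustration (this call is not part of the original analysis, and net_f1 is a name introduced here), the function can be applied to a single period with a stricter frequency threshold:

# illustrative example: network for Fion's first period, keeping only bigrams attested at least twice
net_f1 <- bigrams_fion01 %>% get_network(n_min = 2)
vcount(net_f1)  # number of word nodes
ecount(net_f1)  # number of bigram edges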
Visualization
The functions created above can now be combined to create networks for different age spans.
# function for plotting
get_plot <- function(cur_network, myseed = 1999, min_freq = 0, interactive = FALSE, repel = TRUE, max.overlaps = 20) {
  
  # Compute layout
  layout <- create_layout(cur_network, layout = "fr")
  
  # Build plot
  set.seed(myseed)
  p <- ggplot(layout) +
    geom_edge_link(aes(x = x, y = y, xend = xend, yend = yend, edge_width = rescale(weight, to = c(.05, .5)), alpha = weight), color = "gray") +
    geom_point_interactive(
      aes(x = x, y = y, tooltip = name, color = color, size = Freq)
    ) +
    geom_node_text(aes(label = ifelse(Freq > min_freq, name, ""), size = Freq), repel = repel, max.overlaps = max.overlaps) +
    stat_ellipse(aes(x = x, y = y, group = as.factor(community), fill = as.factor(community)),
                 geom = "polygon", alpha = 0.1, color = NA) +
    scale_color_identity() +
    theme_void() +
    theme(legend.position = "none")
  
  if(interactive) {
    # Zoomable plot with girafe
    g <- girafe(
      ggobj = p,
      options = list(
        opts_zoom(min = 1, max = 60),
        opts_toolbar(saveaspng = TRUE)
      )
    )
    return(g)
  } else {
    return(p)
  }
}
Plots for publication
# create seven plot objects named p_f1 to p_f7 for Fion's networks
for(i in 1:7) {
assign(paste0("p_f", i), get(paste0("bigrams_fion0", i)) %>% get_network() %>% get_plot(repel = FALSE, min_freq = 5) + ggtitle(levels(factor(bigrams_fion$age_range))[i]) + theme(plot.title = element_text(face = "bold", hjust = 0.5)))
}
(p_f1 | p_f2 | p_f3) /
  (p_f4 | p_f5 | p_f6) /
  p_f7
# ggsave("images/fion_networks01.png", width = 15, height = 15)
# create six plot objects named p_s1 to p_s6 for Silvie's networks
for(i in 1:6) {
assign(paste0("p_s", i), get(paste0("bigrams_silvie0", i)) %>% get_network() %>% get_plot(repel = TRUE, min_freq = 5, max.overlaps = 50) + ggtitle(levels(factor(bigrams_silvie$age_range))[i]) + theme(plot.title = element_text(face = "bold", hjust = 0.5)))
}
(p_s1 | p_s2 | p_s3) /
  (p_s4 | p_s5 | p_s6)
# ggsave("images/silvie_networks01.png", width = 15, height = 15)
Plots for online viewing
# Fion
bigrams_fion01 %>% get_network() %>% get_plot(repel = FALSE, interactive = TRUE)
bigrams_fion02 %>% get_network() %>% get_plot(repel = FALSE, interactive = TRUE)
bigrams_fion03 %>% get_network() %>% get_plot(repel = FALSE, interactive = TRUE)
bigrams_fion04 %>% get_network() %>% get_plot(repel = FALSE, interactive = TRUE)
bigrams_fion05 %>% get_network() %>% get_plot(repel = FALSE, interactive = TRUE)
bigrams_fion06 %>% get_network() %>% get_plot(repel = FALSE, interactive = TRUE)
bigrams_fion07 %>% get_network() %>% get_plot(repel = FALSE, interactive = TRUE)
# Silvie
bigrams_silvie01 %>% get_network() %>% get_plot(repel = FALSE, interactive = TRUE)
bigrams_silvie02 %>% get_network() %>% get_plot(repel = FALSE, interactive = TRUE)
bigrams_silvie03 %>% get_network() %>% get_plot(repel = FALSE, interactive = TRUE)
bigrams_silvie04 %>% get_network() %>% get_plot(repel = FALSE, interactive = TRUE)
bigrams_silvie05 %>% get_network() %>% get_plot(repel = FALSE, interactive = TRUE)
bigrams_silvie06 %>% get_network() %>% get_plot(repel = FALSE, interactive = TRUE)
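To share these interactive plots outside of this document, the girafe objects (which are htmlwidgets) could also be written to standalone HTML files, for instance via the htmlwidgets package. This is an optional step that is not part of the original script, and the object and file names are placeholders:

# optional: save one interactive network as a standalone HTML file
g_f1 <- bigrams_fion01 %>% get_network() %>% get_plot(repel = FALSE, interactive = TRUE)
htmlwidgets::saveWidget(g_f1, "fion_network01.html", selfcontained = TRUE)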
Modularities
# get modularities
tibble(
  child = c(rep("Fion", 7), rep("Silvie", 6)),
  age = c(levels(factor(bigrams_fion$age_range)),
          levels(factor(bigrams_silvie$age_range))),
  modularity = c(bigrams_fion01 %>% get_network(modularity_measure = TRUE),
                 bigrams_fion02 %>% get_network(modularity_measure = TRUE),
                 bigrams_fion03 %>% get_network(modularity_measure = TRUE),
                 bigrams_fion04 %>% get_network(modularity_measure = TRUE),
                 bigrams_fion05 %>% get_network(modularity_measure = TRUE),
                 bigrams_fion06 %>% get_network(modularity_measure = TRUE),
                 bigrams_fion07 %>% get_network(modularity_measure = TRUE),
                 bigrams_silvie01 %>% get_network(modularity_measure = TRUE),
                 bigrams_silvie02 %>% get_network(modularity_measure = TRUE),
                 bigrams_silvie03 %>% get_network(modularity_measure = TRUE),
                 bigrams_silvie04 %>% get_network(modularity_measure = TRUE),
                 bigrams_silvie05 %>% get_network(modularity_measure = TRUE),
                 bigrams_silvie06 %>% get_network(modularity_measure = TRUE)))
# A tibble: 13 × 3
child age modularity
<chr> <chr> <dbl>
1 Fion 02;03-02;06 0.619
2 Fion 02;07-02;09 0.615
3 Fion 02;10-03;00 0.503
4 Fion 03;01-03;03 0.549
5 Fion 03;04-03;06 0.576
6 Fion 03;07-03;09 0.667
7 Fion 03;10-03;11 0.634
8 Silvie 02;04-02;07 0.577
9 Silvie 02;08-02;10 0.569
10 Silvie 02;11-03;01 0.581
11 Silvie 03;02-03;04 0.540
12 Silvie 03;05-03;07 0.571
13 Silvie 03;08-03;09 0.611
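As an optional follow-up (a sketch rather than part of the original script; the object name mod_tbl is introduced here), the same values can be stored in an object and plotted across the age ranges:

# optional sketch: store the modularities and plot them over the age ranges
mod_tbl <- tibble(
  child = c(rep("Fion", 7), rep("Silvie", 6)),
  age = c(levels(factor(bigrams_fion$age_range)),
          levels(factor(bigrams_silvie$age_range))),
  modularity = c(sapply(1:7, function(i) get(paste0("bigrams_fion0", i)) %>% get_network(modularity_measure = TRUE)),
                 sapply(1:6, function(i) get(paste0("bigrams_silvie0", i)) %>% get_network(modularity_measure = TRUE)))
)

mod_tbl %>%
  ggplot(aes(x = age, y = modularity, group = child)) +
  geom_point() +
  geom_line() +
  facet_wrap(~ child, scales = "free_x") +
  theme(axis.text.x = element_text(angle = 45, hjust = .9)) +
  ylab("Modularity")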