This document contains the source code of Quick & Hartmann (under review). A small sample of the data used in the paper is available here: https://github.com/hartmast/Quick_Hartmann_2021_Building_Blocks. We plan to publish the entire corpus in due time, but currently it still has to be revised in order to make sure that the data are fully anonymized.
First we load a number of packages.
# install non-CRAN package "wizard" if not yet installed
if(!require("devtools")) install.packages("devtools")
if(!require("wizard")) devtools::install_github("hartmast/wizard")
library(tidyverse)
library(tidytext)
library(ngram)
library(quanteda)
library(parallel)
library(pbmcapply)
library(wizard)
library(patchwork)
library(RColorBrewer)
library(lme4)
library(effects)
library(lmerTest)
library(igraph)
library(ggraph)
library(patchwork)
library(svglite)
We define a first helper function that searches for entire words (i.e. strings preceded and followed by either a whitespace or the beginning or end of the element.)
# convenience function: grep whole word
<- function(pattern, x, ...) {
grepw grep(paste("(^| )", pattern, "(?=( |$))|(?<= |^)", pattern, "( |$)", sep="", collapse=""), x, perl = T, ...)
}
# the same but returns binary result (like grepl):
<- function(pattern, x, ...) {
greplw grepl(paste("(^| )", pattern, "(?=( |$))|(?<= |^)", pattern, "( |$)", sep="", collapse=""), x, perl = T, ...)
}
The next two helper functions are jointly used to generate the pattern candidates from the data. The first replaces the n-th word in a string by XXX, the second simply gets the indices of words to be replaced. Each word and each bigram is supposed to be replaced once in the resulting list of pattern candidates, i.e. if we have an utterance consisting of 4 words, the results would be 1, 2, 3, 4, c(1,2), c(2,3), and c(3,4). The two functions are combined in all_xxx, which returns the list of pattern candidates as shown in the example below. In addition, it appends the original utterance as the last element of the list.
# function for replacing n-th word by xxx
<- function(x, n) {
xxx_this <- unlist(strsplit(x, " "))
x <- "XXX"
x[n] <- paste0(x, collapse = " ")
x return(x)
}
# function for required replacement indices
<- function(x) {
xxx_l <- unlist(strsplit(x, " "))
x <- as.list(1:length(x))
l1
if(length(x) > 1) {
<- lapply(1:(length(x)-1), function(i) c(i:(i+1)))
l2 <- c(l1, l2)
l else {
} <- l1
l
}
return(l)
}
# combine both functions: get all possible
# replacements of 1 or 2 words,
# then all possible 1- to 4-grams
<- function(x) {
all_xxx
# collector
<- list()
my_list
# iterate over all vectors
for(i in 1:length(x)) {
<- xxx_l(x[i])
l_cur <- lapply(1:length(l_cur),
y function(j) xxx_this(x[i], l_cur[[j]]))
length(y)+1]] <- x[i]
y[[<- y
my_list[[i]]
}
return(my_list)
}
all_xxx("This is a test")
## [[1]]
## [[1]][[1]]
## [1] "XXX is a test"
##
## [[1]][[2]]
## [1] "This XXX a test"
##
## [[1]][[3]]
## [1] "This is XXX test"
##
## [[1]][[4]]
## [1] "This is a XXX"
##
## [[1]][[5]]
## [1] "XXX XXX a test"
##
## [[1]][[6]]
## [1] "This XXX XXX test"
##
## [[1]][[7]]
## [1] "This is XXX XXX"
##
## [[1]][[8]]
## [1] "This is a test"
Another helper function identifies the longest consecutive string that is not XXX. This will be used to rank the pattern candidates: In line with previous traceback approaches, the ones with the longest consecutive strings are preferred.
# function for getting the longest subsequent string ----------------------
# (i.e. if there are multiple candidates of frame-
# and-slot patterns, identify the one with the longest
# subsequent fixed string, i.e. string that is not XXX)
<- function(x) {
get_successive_strings <- unlist(strsplit(x, " "))
x1 <- which(x1 == "XXX") # find XXX
x2 <- min(x2) - 1
x_before <- length(x1) - max(x2)
x_after <- max(x_before, x_after)
x3 return(x3)
}
And finally, it will also be important to find out which word has been replaced by XXX. This is the purpose of this last helper function.
# find out which word(s) has/have been replaced by XXX
<- function(x) {
which_xxx <- as.character(x)
x <- sapply(1:length(x),
x function(i) which(unlist(strsplit(x[i], " "))=="XXX"))
return(x)
}
Next, we read in the datasets.
<- read_csv("../master/fion_CHI.csv")
fion <- read_csv("../master/fion_input.csv")
fion_input <- read_csv("../master/silvie_CHI.csv")
silvie <- read_csv("../master/silvie_input.csv") silvie_input
As single-word utterances are not interesting for our purposes, we reduce the children’s data to multi-word utterances.
# only multi-word units ---------------------------------------------------
<- filter(fion, Wordcount > 1)
fion_mwu <- filter(silvie, Wordcount > 1) silvie_mwu
We now create a list of pattern candidates by applying the function defined above to all instances.
# to use the function more efficiently, we vectorize it
<- Vectorize(all_xxx)
all_xxx_v
# now we apply it to all instances
# both of the input and the MWUs.
<- all_xxx_v(fion_input$Utterance_clean)
fion_input_xxx <- all_xxx_v(fion_mwu$Utterance_clean)
fion_mwu_xxx <- all_xxx_v(filter(fion_mwu, type != "mixed")$Utterance_clean)
fion_mono_xxx <- all_xxx_v(filter(fion_mwu, type == "mixed")$Utterance_clean)
fion_mixed_xxx
<- all_xxx_v(silvie_input$Utterance_clean)
silvie_input_xxx <- all_xxx_v(silvie_mwu$Utterance_clean)
silvie_mwu_xxx <- all_xxx_v(filter(silvie_mwu, type != "mixed")$Utterance_clean)
silvie_mono_xxx <- all_xxx_v(filter(silvie_mwu, type == "mixed")$Utterance_clean) silvie_mixed_xxx
This brings us to the heart of the traceback method: We check whether the full utterance or one of the detected patterns can be found in the main corpus, i.e. a) the input, b) the child’s monolingual data.
In a first step, we check whether there is a verbatim match in the main corpus. We use the multi-core version of lapply
, pbclapply
from the parallel
package. The algorithm uses the greplw
function defined above, which returns TRUE for all utterance in with the string we search for is attested, and checks whether it returns TRUE for any utterance in the respective main corpus. Here we work with utterance types instead of tokens as some utterances are used more than once.
# list of utterance types
<- c(fion_input$Utterance_clean, fion_mwu$Utterance_clean, silvie$Utterance_clean, silvie_mwu$Utterance_clean) %>% unique
utterance_types
# data frame indicating for each utterance whether it occurs in the respective main corpus
<- tibble(
utterance_types_matches Utterance_clean = utterance_types,
exact_match_silvie_input = unlist(pbmclapply(1:length(utterance_types), function(i)
any(greplw(utterance_types[i], silvie_input$Utterance_clean)), mc.cores = 6)),
exact_match_fion_input = unlist(pbmclapply(1:length(utterance_types), function(i)
any(greplw(utterance_types[i], fion_input$Utterance_clean)), mc.cores = 6))
)
We now join the new dataframe with the existing ones.
<- left_join(fion_mwu, utterance_types_matches)
fion_mwu <- left_join(silvie_mwu, utterance_types_matches) silvie_mwu
Now, we get the number of successive strings in each pattern using the function defined above. This information will later be used to rank the patterns.
# vectorized version:
<- Vectorize(get_successive_strings)
get_successive_strings_v
# for Fion:
<- pbmclapply(1:length(fion_mwu_xxx),
fion_mwu_xxx_successive function(i) suppressWarnings(get_successive_strings_v(fion_mwu_xxx[[i]])))
# for Silvie:
<- pbmclapply(1:length(silvie_mwu_xxx),
silvie_mwu_xxx_successive function(i) suppressWarnings(get_successive_strings_v(silvie_mwu_xxx[[i]])))
We also have to find out which word(s) has/have been replaced in each pattern candidate, using the appropriate helper function defined above.
<-
fion_mwu_which_xxx pbmclapply(1:length(fion_mwu_xxx),
function(i) which_xxx(fion_mwu_xxx[[i]]), mc.cores = 6)
<-
silvie_mwu_which_xxx pbmclapply(1:length(silvie_mwu_xxx),
function(i) which_xxx(silvie_mwu_xxx[[i]]), mc.cores = 6)
As we want to know whether the omitted words occur in the main corpus, we compile a list of tokens and determine for each token whether or not it occurs in the monolingual and in the input data.
# get list of words and check if they are attested
# in a) Fion's monolingual data, b) Fion's input data,
# c) Silvie's monolingual data, d) Silvie' input data
<- c(
all_tokens tokens(filter(silvie, type != "mixed")$Utterance_clean) %>% unlist %>% as.character() %>% unique,
tokens(filter(fion, type != "mixed")$Utterance_clean) %>% unlist %>% as.character() %>% unique,
tokens(silvie_input$Utterance_clean) %>% unlist %>% as.character() %>% unique,
tokens(fion_input$Utterance_clean) %>% unlist %>% as.character() %>% unique
%>% unique )
## Warning: NA is replaced by empty string
<- tibble(
all_tokens_check token = all_tokens,
silvie_mono = all_tokens %in% as.character(unlist(tokens(filter(silvie, type != "mixed")$Utterance_clean))),
fion_mono = all_tokens %in% as.character(unlist(tokens(filter(fion, type != "mixed")$Utterance_clean))),
silvie_input = all_tokens %in% as.character(unlist(tokens(silvie_input$Utterance_clean))),
fion_input = all_tokens %in% as.character(unlist(tokens(fion_input$Utterance_clean)))
)
## Warning: NA is replaced by empty string
Now we have to check, for each pattern, whether the omitted word occurs in the respective main corpus. Otherwise, it will not be counted as a valid pattern.
For this purpose, we first define a helper function:
<- function(x, col) {
find_in_tokens # if the token is "" (i.e. "character(0)"), proceed
if(!any(identical(x, character(0)))) {
# otherwise, find it in the list of tokens
if(all(x %in% all_tokens)) {
return(all(sapply(1:length(x),
function(i)
as.logical(all_tokens_check[which(all_tokens_check$token==x[i]), which(colnames(all_tokens_check)==col)]))))
else {
} return(FALSE)
}else {
} return(FALSE)
}
}
For each word, we now want to know the omitted word(s). As in some patterns, one word has been replaced by XXX and in others two, we work with two subsets because if two words have been replaced, the pattern only counts as valid - in combination with the concrete utterance - if the two omitted words are attested consecutively in the main corpus.
# first, get tibble with all candidate patterns
<- tibble(
pattern_candidates pattern = c(unlist(fion_mwu_xxx), unlist(silvie_mwu_xxx)) %>% as.character
)
# find the ones that don't have XXX in them
# (those are the targets)
<- grep("XXX", pattern_candidates$pattern, invert = T)
no_xxx $target <- character(nrow(pattern_candidates))
pattern_candidates$target <- c(rep(pattern_candidates$pattern[no_xxx[1]], no_xxx[1]),
pattern_candidatesunlist(sapply(2:length(no_xxx),
function(i)
rep(pattern_candidates$pattern[no_xxx[i]],
nrow(pattern_candidates[no_xxx[i-1]:(no_xxx[i]-1),])))))
# remove no_xxx
<- pattern_candidates[-no_xxx,]
pattern_candidates
# remove "XXX XXX" as this is not a valid pattern
<- pattern_candidates[-which(pattern_candidates=="XXX XXX"),]
pattern_candidates
# get only the ones that contain "XXX" but not "XXX XXX" (i.e. those in which only one word has been replaced)
<- pattern_candidates[grep("XXX XXX", pattern_candidates$pattern, invert = T),]
pattern_candidates1
# and now the ones in which two words have been replaced
<- pattern_candidates[grep("XXX XXX", pattern_candidates$pattern),] pattern_candidates2
Now we check whether the pattern candidates are attested in the main corpus.
$fion_input <-
pattern_candidates1unlist(pbmclapply(1:nrow(pattern_candidates1),
function(i) find_in_tokens(unlist(strsplit(as.character(pattern_candidates1[i,2]), " "))[which(unlist(strsplit(as.character(pattern_candidates1[i,1]), " "))=="XXX")], col = "fion_input"), mc.cores = 6))
$silvie_input <-
pattern_candidates1unlist(pbmclapply(1:nrow(pattern_candidates1),
function(i)
find_in_tokens(unlist(strsplit(as.character(pattern_candidates1[i,2]), " "))[which(unlist(strsplit(as.character(pattern_candidates1[i,1]), " "))=="XXX")], col = "silvie_input"), mc.cores = 6))
$fion <-
pattern_candidates1unlist(pbmclapply(1:nrow(pattern_candidates1),
function(i)
find_in_tokens(unlist(strsplit(as.character(pattern_candidates1[i,2]), " "))[which(unlist(strsplit(as.character(pattern_candidates1[i,1]), " "))=="XXX")], col = "fion"), mc.cores = 6))
$silvie <-
pattern_candidates1unlist(pbmclapply(1:nrow(pattern_candidates1),
function(i)
find_in_tokens(unlist(strsplit(as.character(pattern_candidates1[i,2]), " "))[which(unlist(strsplit(as.character(pattern_candidates1[i,1]), " "))=="XXX")], col = "silvie"), mc.cores = 6))
$fion_mono <- unlist(pbmclapply(1:nrow(pattern_candidates1),
pattern_candidates1function(i)
find_in_tokens(unlist(strsplit(as.character(pattern_candidates1[i,2]), " "))[which(unlist(strsplit(as.character(pattern_candidates1[i,1]), " "))=="XXX")], col = "fion_mono"), mc.cores = 6))
$fion_mixed <- unlist(pbmclapply(1:nrow(pattern_candidates1),
pattern_candidates1function(i)
find_in_tokens(unlist(strsplit(as.character(pattern_candidates1[i,2]), " "))[which(unlist(strsplit(as.character(pattern_candidates1[i,1]), " "))=="XXX")], col = "fion_mixed"), mc.cores = 6))
$silvie_mono <- unlist(pbmclapply(1:nrow(pattern_candidates1),
pattern_candidates1function(i)
find_in_tokens(unlist(strsplit(as.character(pattern_candidates1[i,2]), " "))[which(unlist(strsplit(as.character(pattern_candidates1[i,1]), " "))=="XXX")], col = "silvie_mono"), mc.cores = 6))
$silvie_mixed <- unlist(pbmclapply(1:nrow(pattern_candidates1),
pattern_candidates1function(i)
find_in_tokens(unlist(strsplit(as.character(pattern_candidates1[i,2]), " "))[which(unlist(strsplit(as.character(pattern_candidates1[i,1]), " "))=="XXX")], col = "silvie_mixed"), mc.cores = 6))
For the second batch of pattern candidates - those with 2 omitted words - we do the same but with bigrams
# add column with omitted bigrams
$omitted_bigram <- unlist(pbmclapply(1:nrow(pattern_candidates2),
pattern_candidates2function(i)
paste0(unlist(strsplit(pattern_candidates2$target[i], " "))[which(unlist(strsplit(pattern_candidates2$pattern[i], " ")) == "XXX")], collapse = " "), mc.cores = 6))
# search the omitted bigram in the data
$fion_input <- logical(nrow(pattern_candidates2))
pattern_candidates2which(pattern_candidates2$omitted_bigram %in% as.character(unlist(tokens_ngrams(tokens(fion_input$Utterance_clean), concatenator = " ")))),]$fion_input <- TRUE
pattern_candidates2[
$silvie_input <- logical(nrow(pattern_candidates2))
pattern_candidates2which(pattern_candidates2$omitted_bigram %in% as.character(unlist(tokens_ngrams(tokens(silvie_input$Utterance_clean), concatenator = " ")))),]$silvie_input <- TRUE pattern_candidates2[
## Warning: NA is replaced by empty string
## Warning: NA is replaced by empty string
$fion_mono <- logical(nrow(pattern_candidates2))
pattern_candidates2which(pattern_candidates2$omitted_bigram %in% as.character(unlist(tokens_ngrams(tokens(filter(fion_mwu, type != "mixed")$Utterance_clean), concatenator = " ")))),]$fion_mono <- TRUE
pattern_candidates2[
$silvie_mono <- logical(nrow(pattern_candidates2))
pattern_candidates2which(pattern_candidates2$omitted_bigram %in% as.character(unlist(tokens_ngrams(tokens(filter(silvie_mwu, type != "mixed")$Utterance_clean), concatenator = " ")))),]$silvie_mono <- TRUE pattern_candidates2[
Now we combine the two:
<- rbind(pattern_candidates1, pattern_candidates2[,-which(colnames(pattern_candidates2)=="omitted_bigram")]) pattern_candidates
Now we have to find out how often each pattern occurs in the respective main corpus, as we work with a frequency threshold of 2. For this purpose, we work with pattern types instead of tokens.
# list of pattern types
<- tibble(
pattern_types pattern = unique(pattern_candidates$pattern)
)
# get successive strings in all pattern candidates
$successive <- pbmclapply(1:nrow(pattern_types),
pattern_typesfunction(i)
get_successive_strings_v(pattern_types$pattern[i]))
$successive <- as.numeric(unlist(pattern_types$successive))
pattern_types
# find number of pattern matches in the four corpora
$pattern_match_fion_input <- unlist(pbmclapply(1:nrow(pattern_types), function(i)
pattern_typeslength(grepw(gsub("XXX( XXX)?", ".+",
$pattern[i]),
pattern_types$Utterance_clean)), mc.cores = 6))
fion_input$pattern_match_silvie_input <- unlist(pbmclapply(1:nrow(pattern_types), function(i)
pattern_typeslength(grepw(gsub("XXX( XXX)?", ".+",
$pattern[i]),
pattern_types$Utterance_clean)), mc.cores = 6))
silvie_input$pattern_match_silvie <- unlist(pbmclapply(1:nrow(pattern_types), function(i)
pattern_typeslength(grepw(gsub("XXX( XXX)?", ".+",
$pattern[i]),
pattern_types$Utterance_clean)), mc.cores = 6))
silvie$pattern_match_silvie_mono <- unlist(pbmclapply(1:nrow(pattern_types), function(i)
pattern_typeslength(grepw(gsub("XXX( XXX)?", ".+",
$pattern[i]),
pattern_typesfilter(silvie, type %in% c("english", "german"))$Utterance_clean)), mc.cores = 6))
$pattern_match_fion_mono <- unlist(pbmclapply(1:nrow(pattern_types), function(i)
pattern_typeslength(grepw(gsub("XXX( XXX)?", ".+",
$pattern[i]),
pattern_typesfilter(fion, type %in% c("english", "german"))$Utterance_clean)), mc.cores = 6))
# merge with pattern_candidates
<- left_join(pattern_candidates, pattern_types) pattern_candidates
Now we check for every target utterance if the traceback is successful by checking whether a) the candidate pattern is attested at least twice in the main corpus, b) the omitted word(s) is/are attested in the main corpus. In many cases, more than one pattern candidate is valid; in these cases, pattern candidates with longer subsequent strings and pattern candidates with utterance-initial fixed strings (rather than XXX) are preferred, i.e. [this is XXX] is preferred over [this XXX cake] (longer consecutive fixed strings: 2 words vs. 1 word) and over [XXX is cake] (utterance-initial vs. utterance-internal/-final open slot).
# add column indicating traceback success &
# column indicating pattern
$success_fion_input <- logical(nrow(fion_mwu))
fion_mwu$pattern_fion_input <- character(nrow(fion_mwu))
fion_mwu
$success_silvie_input <- logical(nrow(silvie_mwu))
silvie_mwu$pattern_silvie_input <- character(nrow(silvie_mwu))
silvie_mwu
$success_silvie_input <- logical(nrow(fion_mwu))
fion_mwu$pattern_silvie_input <- character(nrow(fion_mwu))
fion_mwu
$success_fion_input <- logical(nrow(silvie_mwu))
silvie_mwu$pattern_fion_input <- character(nrow(silvie_mwu))
silvie_mwu
# fill those columns for the rows that
# have already been identified as exact matches
which(fion_mwu$exact_match_fion_input==TRUE),]$success_fion_input <- TRUE
fion_mwu[which(fion_mwu$exact_match_fion_input==TRUE),]$pattern_fion_input <-
fion_mwu[which(fion_mwu$exact_match_fion_input==TRUE),]$Utterance_clean
fion_mwu[
which(silvie_mwu$exact_match_silvie_input==TRUE),]$success_silvie_input <- TRUE
silvie_mwu[which(silvie_mwu$exact_match_silvie_input==TRUE),]$pattern_silvie_input <-
silvie_mwu[which(silvie_mwu$exact_match_silvie_input==TRUE),]$Utterance_clean
silvie_mwu[
which(fion_mwu$exact_match_silvie_input==TRUE),]$success_silvie_input <- TRUE
fion_mwu[which(fion_mwu$exact_match_silvie_input==TRUE),]$pattern_silvie_input <-
fion_mwu[which(fion_mwu$exact_match_silvie_input==TRUE),]$Utterance_clean
fion_mwu[
which(silvie_mwu$exact_match_fion_input==TRUE),]$success_fion_input <- TRUE
silvie_mwu[which(silvie_mwu$exact_match_fion_input==TRUE),]$pattern_fion_input <-
silvie_mwu[which(silvie_mwu$exact_match_fion_input==TRUE),]$Utterance_clean silvie_mwu[
We define a function that allows us to trace back the frame-and-slot patterns. As arguments, it requires a) the test corpus (e.g. the data frame fion_mwu), b) the main corpus (e.g. fion_input), c) the pattern list
<- function(test_corpus, main_corpus, xxx) {
traceback for(k in 1:nrow(test_corpus)) {
# column name for checking verbatim match in main corpus
<- which(colnames(test_corpus)==paste0("exact_match_", main_corpus))
col_em
if(test_corpus[k, col_em] == FALSE) {
# list of patterns:
<- xxx[[k]]
l_cur
# the last one (= literal utterance/target)
# can be discarded
<- l_cur[length(l_cur)]
target <- unlist(target)
target <- l_cur[1:(length(l_cur)-1)]
l_cur
# unlist
<- unlist(l_cur)
l_cur
# add info from pattern_candidates table
<- tibble(pattern = l_cur, target = target)
l_cur <- left_join(l_cur, pattern_candidates)
l_cur
# keep only those with pattern_match >= 2 in the
# target corpus & token_match = TRUE in the target
# corpus
if(main_corpus == "fion_input") {
<- filter(l_cur, fion_input == TRUE &
l_cur >= 2)
pattern_match_fion_input else if(main_corpus == "silvie_input") {
} <- filter(l_cur, silvie_input == TRUE &
l_cur >= 2)
pattern_match_silvie_input else if(main_corpus == "fion_mono") {
} <- filter(l_cur, fion_mono == TRUE &
l_cur >= 2)
pattern_match_fion_mono else if(main_corpus=="silvie_mono") {
} <- filter(l_cur, silvie_mono == TRUE &
l_cur >= 2)
pattern_match_silvie_mono
}
if(nrow(l_cur) > 0) {
# specify if open slot is utterance-initial (if so,
# it will be dispreferred)
$utterance_initial_xxx <- grepl("^XXX", l_cur$pattern)
l_cur%>% arrange(utterance_initial_xxx)
l_cur
}
# sort by length of successive strings
if(nrow(l_cur) > 0) {
<- l_cur %>% arrange(desc(successive))
l_cur
}
# get the relevant columns for adding the results
<- which(colnames(test_corpus) == paste0("success_", main_corpus))
col1 <- which(colnames(test_corpus) == paste0("pattern_", main_corpus))
col2
# the first one wins out
if(nrow(l_cur) > 0) {
<- TRUE
test_corpus[k, col1] <- l_cur$pattern[1]
test_corpus[k, col2] else {
} <- FALSE
test_corpus[k, col1]
}
}
print(k)
}
return(test_corpus)
}
As we want to use the code-mixed data as test corpus, we create a data frame only with the code-mixed utterances.
# subset with code-mixed utterances
<- filter(fion_mwu, type == "mixed")
fion_mixed <- filter(silvie_mwu, type == "mixed")
silvie_mixed
# empty columns for success values and patterns
# (we don't have to bother about exact matches here:
# there are none, as code-mixed and monolingual data
# are two entirely distinct sets)
$success_fion_mono <- logical(nrow(fion_mixed))
fion_mixed$success_silvie_mono <- logical(nrow(fion_mixed))
fion_mixed$exact_match_fion_mono <- FALSE
fion_mixed$exact_match_silvie_mono <- FALSE
fion_mixed
$success_silvie_mono <- logical(nrow(silvie_mixed))
silvie_mixed$success_fion_mono <- logical(nrow(silvie_mixed))
silvie_mixed$exact_match_silvie_mono <- FALSE
silvie_mixed$exact_match_fion_mono <- FALSE silvie_mixed
Now we apply the function to the cases we are interested in.
# Fion --> Own input
<- traceback(test_corpus = fion_mwu, main_corpus = "fion_input", xxx = fion_mwu_xxx)
fion_mwu
# Silvie --> Own input
<- traceback(test_corpus = silvie_mwu, main_corpus = "silvie_input", xxx = silvie_mwu_xxx)
silvie_mwu
# Fion --> Silvie's input
<- traceback(test_corpus = fion_mwu, main_corpus = "silvie_input", xxx = fion_mwu_xxx)
fion_mwu
# Silvie --> Fion's input
<- traceback(test_corpus = silvie_mwu, main_corpus = "fion_input", xxx = silvie_mwu_xxx)
silvie_mwu
# Fion mixed --> Own monolingual data
<- traceback(test_corpus = fion_mixed, main_corpus = "fion_mono", xxx = fion_mixed_xxx)
fion_mixed
# Silvie mixed --> Own monolingual data
<- traceback(test_corpus = silvie_mixed, main_corpus = "silvie_mono", xxx = silvie_mixed_xxx)
silvie_mixed
# Fion mixed --> Silvie's monolingual data
<- traceback(test_corpus = fion_mixed, main_corpus = "silvie_mono", xxx = fion_mixed_xxx)
fion_mixed
# Silvie mixed --> Fion's monolingual data
<- traceback(test_corpus = silvie_mixed, main_corpus = "fion_mono", xxx = silvie_mixed_xxx) silvie_mixed
For visualization, we use the qbarplot
function from the wizard
package. As we want to get a ternary result (fixed string, frame+slot pattern, or fail), we first define a function that allows adding a column thct contains this information.
# function for getting traceback success
# as a ternary variable
<- function(x, col1, col2) {
tb_ternary
# add ternary traceback success
<- sapply(1:nrow(x), function(i)
x1 if(x[i,which(colnames(x)==col1)] == TRUE
& x[i, which(colnames(x) == col2)] == TRUE ) "exact match"
else if (x[i,which(colnames(x)==col1)] == FALSE &
which(colnames(x) == col2)] == TRUE ) "frame + slot" else
x[i, "fail")
<- factor(x1, levels = c("fail", "frame + slot", "exact match"))
x1 <- droplevels(x1)
x1
return(x1)
}
# Child_mixed --> Own monolingual data
<- rbind(mutate(select(silvie_mixed, Month),
( p1 Child = "Silvie", Traceback_success = tb_ternary(silvie_mixed, "exact_match_silvie_mono", "success_silvie_mono")),
mutate(select(fion_mixed, Month),
Child = "Fion", Traceback_success = tb_ternary(fion_mixed, "exact_match_fion_mono", "success_fion_mono"))) %>%
qbarplot(x = "Child", fill = Traceback_success) +
guides(fill = guide_legend(title = "Traceback success")) +
ggtitle("Test corpus: Child - mixed,\nmain corpus: Own monolingual data") +
theme(plot.title = element_text(face = "bold", hjust = 0.5)) )
<- rbind(mutate(select(silvie_mixed, Month),
( p2 Child = "Silvie", Traceback_success = tb_ternary(silvie_mixed, "exact_match_fion_mono", "success_fion_mono")),
mutate(select(fion_mixed, Month),
Child = "Fion", Traceback_success = tb_ternary(fion_mixed, "exact_match_silvie_mono", "success_silvie_mono"))) %>%
qbarplot(x = "Child", fill = Traceback_success,
color = brewer.pal(3, "Greens")[2:3]) +
guides(fill = guide_legend(title = "Traceback success")) +
ggtitle("Test corpus: Child - mixed,\nmain corpus: Other child's monolingual data") +
theme(plot.title = element_text(face = "bold", hjust = 0.5)) )
# Child - mixed --> Input data
<- rbind(
mixed_to_input select(filter(fion_mwu, type == "mixed"), Month, exact_match_fion_input, success_fion_input, exact_match_silvie_input, success_silvie_input) %>% mutate (Child = "Fion", Traceback_success_within = tb_ternary(filter(fion_mwu, type == "mixed"), "exact_match_fion_input", "success_fion_input"), Traceback_success_cross = tb_ternary(filter(fion_mwu, type == "mixed"), "exact_match_silvie_input", "success_silvie_input")) %>% rename(exact_match_within = exact_match_fion_input, success_within = success_fion_input, exact_match_cross = exact_match_silvie_input, success_cross = success_silvie_input),
select(filter(silvie_mwu, type == "mixed"), Month, exact_match_silvie_input, success_silvie_input, exact_match_fion_input, success_fion_input) %>% mutate (Child = "Silvie", Traceback_success_within = tb_ternary(filter(silvie_mwu, type == "mixed"), "exact_match_silvie_input", "success_silvie_input"), Traceback_success_cross = tb_ternary(filter(silvie_mwu, type == "mixed"), "exact_match_fion_input", "success_fion_input")) %>% rename(exact_match_within = exact_match_silvie_input, success_within = success_silvie_input, exact_match_cross = exact_match_fion_input, success_cross = success_fion_input)
)
<- qbarplot(mixed_to_input, x = "Child", fill = "Traceback_success_within") + guides(fill = guide_legend(title = "Traceback success")) +
( p3 ggtitle("Test corpus: Child - mixed,\nmain corpus: Own input data") +
theme(plot.title = element_text(face = "bold", hjust = 0.5)) )
<- qbarplot(mixed_to_input, x = "Child", fill = "Traceback_success_cross", color = brewer.pal(3, "Greens")) + guides(fill = guide_legend(title = "Traceback success")) +
( p4 ggtitle("Test corpus: Child - mixed,\nmain corpus: Other child's input data") +
theme(plot.title = element_text(face = "bold", hjust = 0.5)) )
# export
# p1 | p2
# ggsave("mixed_to_mono.png", width = 14, height = 6)
#
# p3 | p4
# ggsave("mixed_to_input.png", width = 14, height = 6)
As the traceback algorithm has been applied to the entire dataset, not just the code-mixed data, we are now interested in the results for the full dataset.
<- rbind(
all_to_input select(fion_mwu, Month, exact_match_fion_input, success_fion_input, exact_match_silvie_input, success_silvie_input) %>% mutate (Child = "Fion", Traceback_success_within = tb_ternary(fion_mwu, "exact_match_fion_input", "success_fion_input"), Traceback_success_cross = tb_ternary(fion_mwu, "exact_match_silvie_input", "success_silvie_input")) %>% rename(exact_match_within = exact_match_fion_input, success_within = success_fion_input, exact_match_cross = exact_match_silvie_input, success_cross = success_silvie_input),
select(silvie_mwu, Month, exact_match_silvie_input, success_silvie_input, exact_match_fion_input, success_fion_input) %>% mutate (Child = "Silvie", Traceback_success_within = tb_ternary(silvie_mwu, "exact_match_silvie_input", "success_silvie_input"), Traceback_success_cross = tb_ternary(silvie_mwu, "exact_match_fion_input", "success_fion_input")) %>% rename(exact_match_within = exact_match_silvie_input, success_within = success_silvie_input, exact_match_cross = exact_match_fion_input, success_cross = success_fion_input)
)
<- qbarplot(all_to_input, x = "Child", fill = "Traceback_success_within") + guides(fill = guide_legend(title = "Traceback success")) +
( p1a ggtitle("Test corpus: Child - entire dataset,\nmain corpus: Own input data") +
theme(plot.title = element_text(face = "bold", hjust = 0.5)) )
<- qbarplot(all_to_input, x = "Child", fill = "Traceback_success_cross", color = brewer.pal(3, "Greens")) + guides(fill = guide_legend(title = "Traceback success")) +
( p2a ggtitle("Test corpus: Child - entire dataset,\nmain corpus: Other child's input data") +
theme(plot.title = element_text(face = "bold", hjust = 0.5)) )
# export for publication:
# p1a | p2a
# ggsave("all_to_input.png", width = 14, height = 6)
In order to evaluate whether the traceback is significantly more successful in the within-corpus approch, we fit a mixed-effects logistic regression model using the traceback result as response variable.
We do this first for the mixed to mono data:
# first, we need a "long" dataframe so that the results for within-corpus and mixed-corpus are in different rows, rather than columns.
<- rbind(
model_input_fion mutate(select(fion_mixed, Utterance_clean, success_fion_mono), type = "within") %>% rename(success = success_fion_mono),
mutate(select(fion_mixed, Utterance_clean, success_silvie_mono), type = "cross") %>% rename(success = success_silvie_mono)
)
<- rbind(
model_input_silvie mutate(select(silvie_mixed, Utterance_clean, success_silvie_mono), type = "within") %>% rename(success = success_silvie_mono),
mutate(select(silvie_mixed, Utterance_clean, success_fion_mono), type = "cross") %>% rename(success = success_fion_mono)
)
# Use within-corpus as baseline category
$type <- factor(model_input_fion$type, levels = c("within", "cross"))
model_input_fion
$type <- factor(model_input_silvie$type, levels = c("within", "cross"))
model_input_silvie
<- glmer(success ~ type + (1 | Utterance_clean),
m_fion data = model_input_fion, family = "binomial")
summary(m_fion)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: success ~ type + (1 | Utterance_clean)
## Data: model_input_fion
##
## AIC BIC logLik deviance df.resid
## 6227.7 6248.3 -3110.9 6221.7 6999
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -6.4196 -0.0120 -0.0003 0.0044 8.8287
##
## Random effects:
## Groups Name Variance Std.Dev.
## Utterance_clean (Intercept) 507.8 22.53
## Number of obs: 7002, groups: Utterance_clean, 3029
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.7815 0.2174 -40.39 <2e-16 ***
## typecross -7.6318 0.2746 -27.79 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## typecross 0.517
<- glmer(success ~ type + (1 | Utterance_clean),
m_silvie data = model_input_silvie, family = "binomial")
summary(m_silvie)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: success ~ type + (1 | Utterance_clean)
## Data: model_input_silvie
##
## AIC BIC logLik deviance df.resid
## 5596.9 5618.1 -2795.5 5590.9 8555
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.5157 -0.0101 -0.0004 -0.0004 5.8686
##
## Random effects:
## Groups Name Variance Std.Dev.
## Utterance_clean (Intercept) 381 19.52
## Number of obs: 8558, groups: Utterance_clean, 4095
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.1552 0.2116 -43.26 <2e-16 ***
## typecross -6.2998 0.2982 -21.12 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## typecross 0.494
And now, the same for the mixed to input data.
<- rbind(
model_fion_input filter(fion_mwu, type == "mixed") %>% select(Utterance_clean, success_fion_input) %>% mutate(Type = "within") %>% rename(success = success_fion_input),
filter(fion_mwu, type == "mixed") %>% select(Utterance_clean, success_silvie_input) %>% mutate(Type = "cross") %>% rename(success = success_silvie_input)
)
<- rbind(
model_silvie_input filter(silvie_mwu, type == "mixed") %>% select(Utterance_clean, success_silvie_input) %>% mutate(Type = "within") %>% rename(success = success_silvie_input),
filter(silvie_mwu, type == "mixed") %>% select(Utterance_clean, success_fion_input) %>% mutate(Type = "cross") %>% rename(success = success_fion_input)
)
# within-corpus as baseline category
$Type <- factor(model_fion_input$Type, levels = c("within", "cross"))
model_fion_input
$Type <- factor(model_silvie_input$Type, levels = c("within", "cross"))
model_silvie_input
<- glmer(success ~ Type + (1 | Utterance_clean),
m_fion_input data = model_fion_input, family = "binomial")
summary(m_fion_input)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: success ~ Type + (1 | Utterance_clean)
## Data: model_fion_input
##
## AIC BIC logLik deviance df.resid
## 6578.2 6598.8 -3286.1 6572.2 6999
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.9517 -0.0150 -0.0016 0.0474 3.2505
##
## Random effects:
## Groups Name Variance Std.Dev.
## Utterance_clean (Intercept) 461.6 21.49
## Number of obs: 7002, groups: Utterance_clean, 3029
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.2998 0.2222 -37.35 <2e-16 ***
## Typecross -4.4587 0.3171 -14.06 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## Typecross 0.488
<- glmer(success ~ Type + (1 | Utterance_clean),
m_silvie_input data = model_silvie_input, family = "binomial")
summary(m_silvie_input)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: success ~ Type + (1 | Utterance_clean)
## Data: model_silvie_input
##
## AIC BIC logLik deviance df.resid
## 6502.7 6523.8 -3248.3 6496.7 8555
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.1642 -0.0085 -0.0004 0.0080 5.2288
##
## Random effects:
## Groups Name Variance Std.Dev.
## Utterance_clean (Intercept) 641.4 25.33
## Number of obs: 8558, groups: Utterance_clean, 4095
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.4870 0.2273 -41.74 <2e-16 ***
## Typecross -6.1614 0.4611 -13.36 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## Typecross 0.593
And finally, we apply the same analysis to the traceback based on all the child’s utterances (rather than just the code-mixed ones).
# input for the two models:
<- rbind(fion_mwu %>% select(Utterance_clean, success_fion_input) %>% set_names("Utterance", "success") %>% mutate(Type = "within"),
m_all_input_fion %>% select(Utterance_clean, success_silvie_input) %>% set_names("Utterance", "success") %>% mutate(Type = "cross"))
fion_mwu
<- rbind(silvie_mwu %>% select(Utterance_clean, success_silvie_input) %>% set_names("Utterance", "success") %>% mutate(Type = "within"),
m_all_input_silvie %>% select(Utterance_clean, success_fion_input) %>% set_names("Utterance", "success") %>% mutate(Type = "cross"))
silvie_mwu
# change factor levels
$Type <- factor(m_all_input_fion$Type, levels = c("within", "cross"))
m_all_input_fion
$Type <- factor(m_all_input_silvie$Type, levels = c("within", "cross"))
m_all_input_silvie
# models:
<- glmer(success ~ Type + (1 | Utterance), family = "binomial", data = m_all_input_fion)
m_all_fion summary(m_all_fion)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: success ~ Type + (1 | Utterance)
## Data: m_all_input_fion
##
## AIC BIC logLik deviance df.resid
## 54326.1 54353.2 -27160.0 54320.1 61509
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.4134 -0.2082 0.1045 0.1919 1.3415
##
## Random effects:
## Groups Name Variance Std.Dev.
## Utterance (Intercept) 21 4.583
## Number of obs: 61512, groups: Utterance, 21729
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.57460 0.05120 30.76 <2e-16 ***
## Typecross -1.20017 0.03351 -35.81 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## Typecross -0.436
<- glmer(success ~ Type + (1 | Utterance), family = "binomial", data = m_all_input_silvie)
m_all_silvie summary(m_all_silvie)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: success ~ Type + (1 | Utterance)
## Data: m_all_input_silvie
##
## AIC BIC logLik deviance df.resid
## 54044.2 54070.9 -27019.1 54038.2 54555
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.35995 -0.29532 0.05806 0.19750 1.42113
##
## Random effects:
## Groups Name Variance Std.Dev.
## Utterance (Intercept) 21.65 4.653
## Number of obs: 54558, groups: Utterance, 23145
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.17832 0.04799 -3.716 0.000203 ***
## Typecross -1.25671 0.03468 -36.236 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## Typecross -0.234
For the paper, we combine all coefficients in one table:
rbind(
summary(m_fion)$coef %>% as.data.frame %>% rownames_to_column() %>% mutate(model = "Fion --> mono"),
summary(m_silvie)$coef %>% as.data.frame %>% rownames_to_column() %>% mutate(model = "Silvie --> mono"),
summary(m_fion_input)$coef %>% as.data.frame %>% rownames_to_column() %>% mutate(model = "Fion --> input"),
summary(m_silvie_input)$coef %>% as.data.frame %>% rownames_to_column() %>% mutate(model = "Silvie --> input")
%>% pretty_df() # %>% write_excel_csv("model_coef.csv") )
rbind(
summary(m_all_fion)$coef %>% as.data.frame %>% rownames_to_column() %>% mutate(model = "Fion --> input"),
summary(m_all_silvie)$coef %>% as.data.frame %>% rownames_to_column() %>% mutate(model = "Silvie --> input")
%>% pretty_df() # %>% write_excel_csv("model_coef2.csv") )
In addition to the traceback approach, we want to analyze the bigrams in the code-mixed data using a method adapted from Silge & Robinson. We start from scratch by reading in the code-mixed data (and only the code-mixed data) again and adding additional information, especially the more coarse-grained periodization of the data that we will use to track each child’s language development. We reduce the data to the multi-word units.
# read data
<- read_csv("../master/fion_mixed.csv") d_fion
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## Filename = col_character(),
## Age = col_character(),
## Month = col_character(),
## Utt_no = col_double(),
## Speaker = col_character(),
## Utterance = col_character(),
## Lang_Tags = col_character(),
## Utterance_clean = col_character(),
## type = col_character(),
## Wordcount = col_double(),
## gram1 = col_character(),
## gram2 = col_character(),
## gram3 = col_character(),
## gram4 = col_character()
## )
<- read_csv("../master/silvie_mixed.csv") d_silvie
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## Filename = col_character(),
## Age = col_character(),
## Month = col_character(),
## Utt_no = col_double(),
## Speaker = col_character(),
## Utterance = col_character(),
## Lang_Tags = col_character(),
## Utterance_clean = col_character(),
## type = col_character(),
## Wordcount = col_double(),
## gram1 = col_character(),
## gram2 = col_character(),
## gram3 = col_character(),
## gram4 = col_character()
## )
# add wordcount
$wordcount <- sapply(1:nrow(d_fion),
d_fionfunction(i) wordcount(d_fion$Utterance_clean[i]))
$wordcount <- sapply(1:nrow(d_silvie),
d_silviefunction(i) wordcount(d_silvie$Utterance_clean[i]))
# bin months
$Months <- character(nrow(d_fion))
d_fion$Months <- ifelse(d_fion$Month %in% levels(factor(d_fion$Month))[1:7], "02;03-02;09", d_fion$Months)
d_fion$Months <- ifelse(d_fion$Month %in% levels(factor(d_fion$Month))[8:14], "02;10-03;04", d_fion$Months)
d_fion$Months <- ifelse(d_fion$Month %in% levels(factor(d_fion$Month))[15:21], "03;05-03;11", d_fion$Months)
d_fion
$Months <- character(nrow(d_silvie))
d_silvie$Months <- ifelse(d_silvie$Month %in% levels(factor(d_silvie$Month))[1:6], "02;03-02;09", d_silvie$Months)
d_silvie$Months <- ifelse(d_silvie$Month %in% levels(factor(d_silvie$Month))[7:12], "02;10-03;03", d_silvie$Months)
d_silvie$Months <- ifelse(d_silvie$Month %in% levels(factor(d_silvie$Month))[13:28], "03;04-03;19", d_silvie$Months)
d_silvie
# only multi-word units
<- filter(d_fion, wordcount > 1)
mwu_fion <- filter(d_silvie, wordcount > 1) mwu_silvie
Next, we get the bigrams and count them.
# get bigrams
<- mwu_fion %>% unnest_tokens(bigram, Utterance_clean, token = "ngrams", n = 2)
bigrams_fion <- mwu_silvie %>% unnest_tokens(bigram, Utterance_clean, token = "ngrams", n = 2)
bigrams_silvie
# one column for each word
<- bigrams_fion %>% separate(bigram, c("word1", "word2"), sep = " ", remove = F)
bigrams_fion <- bigrams_silvie %>% separate(bigram, c("word1", "word2"), sep = " ", remove = F)
bigrams_silvie
# add child column
<- mutate(bigrams_fion, Child = "Fion")
bigrams_fion <- mutate(bigrams_silvie, Child = "Silvie")
bigrams_silvie
# add together
<- rbind(bigrams_fion, bigrams_silvie)
bigrams
# count
<- bigrams %>% group_by(Child, Months) %>% count(word1, word2, sort = T) bigrams_count
And finally, we visualize them. First, Fion:
# a) for Fion
<- list()
my_plots
for(i in 1:length(levels(factor(d_fion$Months)))) {
# code from Silge & Robinson, Tidy Text Mining with R, CC-BY-NC-3.0,
# https://www.tidytextmining.com/ngrams.html
# check if there are data
<- bigrams_count %>%
l filter(n > 5, Child == "Fion",
== levels(factor(d_fion$Months))[i])
Months
if(nrow(l) > 0) {
# get bigram graph edges and vertices
<- bigrams_count %>%
bigram_graph filter(n > 5, Child == "Fion",
== levels(factor(d_fion$Months))[i]) %>%
Months %>% select(word1, word2, n) %>% graph_from_data_frame()
ungroup
# add arrow
<- grid::arrow(type = "closed", length = unit(.15, "inches"))
a
# plot
set.seed(2020)
<- ggraph(bigram_graph, layout = "fr") +
( p geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
arrow = a, end_cap = circle(.07, 'inches')) +
geom_node_point(color = "lightblue", size = 5) +
geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
ggtitle(paste0("Fion", ", ", levels(factor(bigrams_count$Months))[i])) +
theme_void() +
theme(plot.title = element_text(face = "bold", hjust = 0.5)) +
theme(plot.margin = unit(c(1,1,1,1), "cm")) )
# get current number of plots
<- length(my_plots)
lng
+1]] <- p
my_plots[[lng
}
}
1]] my_plots[[
2]] my_plots[[
3]] my_plots[[
And then Silvie:
<- list()
my_plots2
for(i in 1:length(levels(factor(d_silvie$Months)))) {
# code from Silge & Robinson, Tidy Text Mining with R, CC-BY-NC-3.0,
# https://www.tidytextmining.com/ngrams.html
# check if there are data
<- bigrams_count %>%
l filter(n > 5, Child == "Silvie",
== levels(factor(d_silvie$Months))[i])
Months
if(nrow(l) > 0) {
# get bigram graph edges and vertices
<- bigrams_count %>%
bigram_graph filter(n > 5, Child == "Silvie",
== levels(factor(d_silvie$Months))[i]) %>%
Months %>% select(word1, word2, n) %>% graph_from_data_frame()
ungroup
# add arrow
<- grid::arrow(type = "closed", length = unit(.15, "inches"))
a
# plot
set.seed(2020)
<- ggraph(bigram_graph, layout = "fr") +
( p geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
arrow = a, end_cap = circle(.07, 'inches')) +
geom_node_point(color = "lightblue", size = 5) +
geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
ggtitle(paste0("Silvie", ", ", levels(factor(bigrams_count$Months))[i])) +
theme_void() +
theme(plot.title = element_text(face = "bold", hjust = 0.5)) +
theme(plot.margin = unit(c(1,1,1,1), "cm")) )
# get current number of plots
<- length(my_plots2)
lng
+1]] <- p
my_plots2[[lng
}
}
1]] my_plots2[[
2]] my_plots2[[
3]] my_plots2[[
(Note: The colors indicating the different languages have been added manually using Inkscape).
This page was created using R Markdown using the Bootstrap theme flatly under the MIT License. All material in this repository is licensed under CC-BY 4.0.