The building blocks of code-mixing

Introduction

This document contains the source code of Quick & Hartmann (under review). A small sample of the data used in the paper is available here: https://github.com/hartmast/Quick_Hartmann_2021_Building_Blocks. We plan to publish the entire corpus in due time, but currently it still has to be revised in order to make sure that the data are fully anonymized.

Packages

First we load a number of packages.

# install non-CRAN package "wizard" if not yet installed
if(!require("devtools")) install.packages("devtools")
if(!require("wizard")) devtools::install_github("hartmast/wizard")

library(tidyverse)
library(tidytext)
library(ngram)
library(quanteda)
library(parallel)
library(pbmcapply)
library(wizard)
library(patchwork)
library(RColorBrewer)
library(lme4)
library(effects)
library(lmerTest)
library(igraph)
library(ggraph)
library(patchwork)
library(svglite)

Helper functions

We define a first helper function that searches for entire words (i.e. strings preceded and followed by either a whitespace or the beginning or end of the element.)

# convenience function: grep whole word
grepw <- function(pattern, x, ...) {
  grep(paste("(^| )", pattern, "(?=( |$))|(?<= |^)", pattern, "( |$)", sep="", collapse=""), x, perl = T, ...)
}

# the same but returns binary result (like grepl):
greplw <- function(pattern, x, ...) {
  grepl(paste("(^| )", pattern, "(?=( |$))|(?<= |^)", pattern, "( |$)", sep="", collapse=""), x, perl = T, ...)
}

The next two helper functions are jointly used to generate the pattern candidates from the data. The first replaces the n-th word in a string by XXX, the second simply gets the indices of words to be replaced. Each word and each bigram is supposed to be replaced once in the resulting list of pattern candidates, i.e. if we have an utterance consisting of 4 words, the results would be 1, 2, 3, 4, c(1,2), c(2,3), and c(3,4). The two functions are combined in all_xxx, which returns the list of pattern candidates as shown in the example below. In addition, it appends the original utterance as the last element of the list.

# function for replacing n-th word by xxx
xxx_this <- function(x, n) {
  x <- unlist(strsplit(x, " "))
  x[n] <- "XXX"
  x <- paste0(x, collapse = " ")
  return(x)
}

# function for required replacement indices
xxx_l <- function(x) {
  x <- unlist(strsplit(x, " "))
  l1 <- as.list(1:length(x))
  
  if(length(x) > 1) {
    l2 <- lapply(1:(length(x)-1), function(i) c(i:(i+1)))
    l  <- c(l1, l2)
  } else {
    l <- l1
  }
   
  return(l)
  
}


# combine both functions: get all possible
# replacements of 1 or 2 words,
# then all possible 1- to 4-grams

all_xxx <- function(x) {
  
  # collector
  my_list <- list()
  
  # iterate over all vectors
  for(i in 1:length(x)) {
    l_cur <- xxx_l(x[i])
    y <-  lapply(1:length(l_cur), 
                function(j) xxx_this(x[i], l_cur[[j]]))
    y[[length(y)+1]] <- x[i]
    my_list[[i]] <- y
  }
  
  return(my_list)
  
}

all_xxx("This is a test")

## [[1]]
## [[1]][[1]]
## [1] "XXX is a test"
## 
## [[1]][[2]]
## [1] "This XXX a test"
## 
## [[1]][[3]]
## [1] "This is XXX test"
## 
## [[1]][[4]]
## [1] "This is a XXX"
## 
## [[1]][[5]]
## [1] "XXX XXX a test"
## 
## [[1]][[6]]
## [1] "This XXX XXX test"
## 
## [[1]][[7]]
## [1] "This is XXX XXX"
## 
## [[1]][[8]]
## [1] "This is a test"

Another helper function identifies the longest consecutive string that is not XXX. This will be used to rank the pattern candidates: In line with previous traceback approaches, the ones with the longest consecutive strings are preferred.

# function for getting the longest subsequent string ----------------------
# (i.e. if there are multiple candidates of frame-
# and-slot patterns, identify the one with the longest
# subsequent fixed string, i.e. string that is not XXX)

get_successive_strings <- function(x) {
  x1 <- unlist(strsplit(x, " "))
  x2 <- which(x1 == "XXX") # find XXX
  x_before <- min(x2) - 1
  x_after <- length(x1) - max(x2)
  x3 <- max(x_before, x_after)
  return(x3)
}

And finally, it will also be important to find out which word has been replaced by XXX. This is the purpose of this last helper function.

# find out which word(s) has/have been replaced by XXX
which_xxx <- function(x) {
  x <- as.character(x)
  x <- sapply(1:length(x), 
              function(i) which(unlist(strsplit(x[i], " "))=="XXX"))
  return(x)
}

Read data

Next, we read in the datasets.

fion <- read_csv("../master/fion_CHI.csv")
fion_input <- read_csv("../master/fion_input.csv")
silvie <- read_csv("../master/silvie_CHI.csv")
silvie_input <- read_csv("../master/silvie_input.csv")

As single-word utterances are not interesting for our purposes, we reduce the children’s data to multi-word utterances.

# only multi-word units ---------------------------------------------------

fion_mwu <- filter(fion, Wordcount > 1)
silvie_mwu <- filter(silvie, Wordcount > 1)

Get pattern candidates

We now create a list of pattern candidates by applying the function defined above to all instances.

# to use the function more efficiently, we vectorize it
all_xxx_v <- Vectorize(all_xxx)

# now we apply it to all instances
# both of the input and the MWUs.
fion_input_xxx <- all_xxx_v(fion_input$Utterance_clean)
fion_mwu_xxx <- all_xxx_v(fion_mwu$Utterance_clean)
fion_mono_xxx <- all_xxx_v(filter(fion_mwu, type != "mixed")$Utterance_clean)
fion_mixed_xxx <- all_xxx_v(filter(fion_mwu, type == "mixed")$Utterance_clean)

silvie_input_xxx <- all_xxx_v(silvie_input$Utterance_clean)
silvie_mwu_xxx <- all_xxx_v(silvie_mwu$Utterance_clean)
silvie_mono_xxx <- all_xxx_v(filter(silvie_mwu, type != "mixed")$Utterance_clean)
silvie_mixed_xxx <- all_xxx_v(filter(silvie_mwu, type == "mixed")$Utterance_clean)

Traceback

This brings us to the heart of the traceback method: We check whether the full utterance or one of the detected patterns can be found in the main corpus, i.e. a) the input, b) the child’s monolingual data.

In a first step, we check whether there is a verbatim match in the main corpus. We use the multi-core version of lapply, pbclapply from the parallel package. The algorithm uses the greplw function defined above, which returns TRUE for all utterance in with the string we search for is attested, and checks whether it returns TRUE for any utterance in the respective main corpus. Here we work with utterance types instead of tokens as some utterances are used more than once.

# list of utterance types
utterance_types <- c(fion_input$Utterance_clean, fion_mwu$Utterance_clean, silvie$Utterance_clean, silvie_mwu$Utterance_clean) %>% unique

# data frame indicating for each utterance whether it occurs in the respective main corpus

utterance_types_matches <- tibble(
  Utterance_clean = utterance_types,
  exact_match_silvie_input = unlist(pbmclapply(1:length(utterance_types), function(i)
  any(greplw(utterance_types[i], silvie_input$Utterance_clean)), mc.cores = 6)),
  exact_match_fion_input = unlist(pbmclapply(1:length(utterance_types), function(i)
  any(greplw(utterance_types[i], fion_input$Utterance_clean)), mc.cores = 6))
)

We now join the new dataframe with the existing ones.

fion_mwu <- left_join(fion_mwu, utterance_types_matches)
silvie_mwu <- left_join(silvie_mwu, utterance_types_matches)

Now, we get the number of successive strings in each pattern using the function defined above. This information will later be used to rank the patterns.

# vectorized version:
get_successive_strings_v <- Vectorize(get_successive_strings)

# for Fion:
fion_mwu_xxx_successive <- pbmclapply(1:length(fion_mwu_xxx),
           function(i) suppressWarnings(get_successive_strings_v(fion_mwu_xxx[[i]])))

# for Silvie:
silvie_mwu_xxx_successive <- pbmclapply(1:length(silvie_mwu_xxx),
                                      function(i) suppressWarnings(get_successive_strings_v(silvie_mwu_xxx[[i]])))

We also have to find out which word(s) has/have been replaced in each pattern candidate, using the appropriate helper function defined above.

fion_mwu_which_xxx <- 
  pbmclapply(1:length(fion_mwu_xxx), 
             function(i) which_xxx(fion_mwu_xxx[[i]]), mc.cores = 6)

silvie_mwu_which_xxx <- 
  pbmclapply(1:length(silvie_mwu_xxx), 
             function(i) which_xxx(silvie_mwu_xxx[[i]]), mc.cores = 6)

As we want to know whether the omitted words occur in the main corpus, we compile a list of tokens and determine for each token whether or not it occurs in the monolingual and in the input data.

# get list of words and check if they are attested
# in a) Fion's monolingual data, b) Fion's input data,
# c) Silvie's monolingual data, d) Silvie' input data

all_tokens <- c(
  tokens(filter(silvie, type != "mixed")$Utterance_clean) %>% unlist %>% as.character() %>% unique,
  tokens(filter(fion, type != "mixed")$Utterance_clean) %>% unlist %>% as.character() %>% unique,
  tokens(silvie_input$Utterance_clean) %>% unlist %>% as.character() %>% unique,
  tokens(fion_input$Utterance_clean) %>% unlist %>% as.character() %>% unique
) %>% unique

## Warning: NA is replaced by empty string

all_tokens_check <- tibble(
  token = all_tokens,
  silvie_mono = all_tokens %in% as.character(unlist(tokens(filter(silvie, type != "mixed")$Utterance_clean))),
fion_mono = all_tokens %in% as.character(unlist(tokens(filter(fion, type != "mixed")$Utterance_clean))),
silvie_input = all_tokens %in% as.character(unlist(tokens(silvie_input$Utterance_clean))),
fion_input = all_tokens %in% as.character(unlist(tokens(fion_input$Utterance_clean)))
)

## Warning: NA is replaced by empty string

Now we have to check, for each pattern, whether the omitted word occurs in the respective main corpus. Otherwise, it will not be counted as a valid pattern.

For this purpose, we first define a helper function:

find_in_tokens <- function(x, col) {
  # if the token is "" (i.e. "character(0)"), proceed
  if(!any(identical(x, character(0)))) {
    
    # otherwise, find it in the list of tokens
    if(all(x %in% all_tokens)) {
      return(all(sapply(1:length(x), 
                        function(i) 
                          as.logical(all_tokens_check[which(all_tokens_check$token==x[i]), which(colnames(all_tokens_check)==col)]))))
    } else {
      return(FALSE)
    }
  } else {
    return(FALSE)
  }
  
}

For each word, we now want to know the omitted word(s). As in some patterns, one word has been replaced by XXX and in others two, we work with two subsets because if two words have been replaced, the pattern only counts as valid - in combination with the concrete utterance - if the two omitted words are attested consecutively in the main corpus.

# first, get tibble with all candidate patterns
pattern_candidates <- tibble(
  pattern = c(unlist(fion_mwu_xxx), unlist(silvie_mwu_xxx)) %>% as.character
)


# find the ones that don't have XXX in them
# (those are the targets)
no_xxx <- grep("XXX", pattern_candidates$pattern, invert = T)
pattern_candidates$target <- character(nrow(pattern_candidates))
pattern_candidates$target <- c(rep(pattern_candidates$pattern[no_xxx[1]], no_xxx[1]),
  unlist(sapply(2:length(no_xxx),
                function(i)
                  rep(pattern_candidates$pattern[no_xxx[i]],
                      nrow(pattern_candidates[no_xxx[i-1]:(no_xxx[i]-1),])))))

# remove no_xxx
pattern_candidates <- pattern_candidates[-no_xxx,]
 
# remove "XXX XXX" as this is not a valid pattern
pattern_candidates <- pattern_candidates[-which(pattern_candidates=="XXX XXX"),]

# get only the ones that contain "XXX" but not "XXX XXX" (i.e. those in which only one word has been replaced)
pattern_candidates1 <- pattern_candidates[grep("XXX XXX", pattern_candidates$pattern, invert = T),]

# and now the ones in which two words have been replaced
pattern_candidates2 <- pattern_candidates[grep("XXX XXX", pattern_candidates$pattern),]

Now we check whether the pattern candidates are attested in the main corpus.

pattern_candidates1$fion_input <-
  unlist(pbmclapply(1:nrow(pattern_candidates1),
                    function(i)                      find_in_tokens(unlist(strsplit(as.character(pattern_candidates1[i,2]), " "))[which(unlist(strsplit(as.character(pattern_candidates1[i,1]), " "))=="XXX")], col = "fion_input"), mc.cores = 6))

pattern_candidates1$silvie_input <-
  unlist(pbmclapply(1:nrow(pattern_candidates1),
                    function(i)
 find_in_tokens(unlist(strsplit(as.character(pattern_candidates1[i,2]), " "))[which(unlist(strsplit(as.character(pattern_candidates1[i,1]), " "))=="XXX")], col = "silvie_input"), mc.cores = 6))
pattern_candidates1$fion <-
  unlist(pbmclapply(1:nrow(pattern_candidates1),
                    function(i)
                      find_in_tokens(unlist(strsplit(as.character(pattern_candidates1[i,2]), " "))[which(unlist(strsplit(as.character(pattern_candidates1[i,1]), " "))=="XXX")], col = "fion"), mc.cores = 6))
pattern_candidates1$silvie <-
  unlist(pbmclapply(1:nrow(pattern_candidates1),
                    function(i)
                      find_in_tokens(unlist(strsplit(as.character(pattern_candidates1[i,2]), " "))[which(unlist(strsplit(as.character(pattern_candidates1[i,1]), " "))=="XXX")], col = "silvie"), mc.cores = 6))

pattern_candidates1$fion_mono <- unlist(pbmclapply(1:nrow(pattern_candidates1),
                     function(i)
                       find_in_tokens(unlist(strsplit(as.character(pattern_candidates1[i,2]), " "))[which(unlist(strsplit(as.character(pattern_candidates1[i,1]), " "))=="XXX")], col = "fion_mono"), mc.cores = 6))
pattern_candidates1$fion_mixed <- unlist(pbmclapply(1:nrow(pattern_candidates1),
                                                  function(i)
                                                    find_in_tokens(unlist(strsplit(as.character(pattern_candidates1[i,2]), " "))[which(unlist(strsplit(as.character(pattern_candidates1[i,1]), " "))=="XXX")], col = "fion_mixed"), mc.cores = 6))
pattern_candidates1$silvie_mono <- unlist(pbmclapply(1:nrow(pattern_candidates1),
                                                  function(i)
                                                    find_in_tokens(unlist(strsplit(as.character(pattern_candidates1[i,2]), " "))[which(unlist(strsplit(as.character(pattern_candidates1[i,1]), " "))=="XXX")], col = "silvie_mono"), mc.cores = 6))
pattern_candidates1$silvie_mixed <- unlist(pbmclapply(1:nrow(pattern_candidates1),
                                                   function(i)
                                                     find_in_tokens(unlist(strsplit(as.character(pattern_candidates1[i,2]), " "))[which(unlist(strsplit(as.character(pattern_candidates1[i,1]), " "))=="XXX")], col = "silvie_mixed"), mc.cores = 6))

For the second batch of pattern candidates - those with 2 omitted words - we do the same but with bigrams

# add column with omitted bigrams
pattern_candidates2$omitted_bigram <- unlist(pbmclapply(1:nrow(pattern_candidates2), 
           function(i) 
             paste0(unlist(strsplit(pattern_candidates2$target[i], " "))[which(unlist(strsplit(pattern_candidates2$pattern[i], " ")) == "XXX")], collapse = " "), mc.cores = 6))

# search the omitted bigram in the data
pattern_candidates2$fion_input <- logical(nrow(pattern_candidates2))
pattern_candidates2[which(pattern_candidates2$omitted_bigram %in% as.character(unlist(tokens_ngrams(tokens(fion_input$Utterance_clean), concatenator = " ")))),]$fion_input <- TRUE

pattern_candidates2$silvie_input <- logical(nrow(pattern_candidates2))
pattern_candidates2[which(pattern_candidates2$omitted_bigram %in% as.character(unlist(tokens_ngrams(tokens(silvie_input$Utterance_clean), concatenator = " ")))),]$silvie_input <- TRUE

## Warning: NA is replaced by empty string

## Warning: NA is replaced by empty string

pattern_candidates2$fion_mono <- logical(nrow(pattern_candidates2))
pattern_candidates2[which(pattern_candidates2$omitted_bigram %in% as.character(unlist(tokens_ngrams(tokens(filter(fion_mwu, type != "mixed")$Utterance_clean), concatenator = " ")))),]$fion_mono <- TRUE

pattern_candidates2$silvie_mono <- logical(nrow(pattern_candidates2))
pattern_candidates2[which(pattern_candidates2$omitted_bigram %in% as.character(unlist(tokens_ngrams(tokens(filter(silvie_mwu, type != "mixed")$Utterance_clean), concatenator = " ")))),]$silvie_mono <- TRUE

Now we combine the two:

pattern_candidates <- rbind(pattern_candidates1, pattern_candidates2[,-which(colnames(pattern_candidates2)=="omitted_bigram")])

Now we have to find out how often each pattern occurs in the respective main corpus, as we work with a frequency threshold of 2. For this purpose, we work with pattern types instead of tokens.

# list of pattern types
pattern_types <- tibble( 
  pattern = unique(pattern_candidates$pattern)
)


# get successive strings in all pattern candidates
pattern_types$successive <- pbmclapply(1:nrow(pattern_types),
           function(i)
             get_successive_strings_v(pattern_types$pattern[i]))

pattern_types$successive <- as.numeric(unlist(pattern_types$successive))

# find number of pattern matches in the four corpora
pattern_types$pattern_match_fion_input <- unlist(pbmclapply(1:nrow(pattern_types), function(i)
  length(grepw(gsub("XXX( XXX)?", ".+",
                    pattern_types$pattern[i]),
               fion_input$Utterance_clean)), mc.cores = 6))
pattern_types$pattern_match_silvie_input <- unlist(pbmclapply(1:nrow(pattern_types), function(i)
  length(grepw(gsub("XXX( XXX)?", ".+",
                    pattern_types$pattern[i]),
               silvie_input$Utterance_clean)), mc.cores = 6))
pattern_types$pattern_match_silvie <- unlist(pbmclapply(1:nrow(pattern_types), function(i)
  length(grepw(gsub("XXX( XXX)?", ".+",
                    pattern_types$pattern[i]),
               silvie$Utterance_clean)), mc.cores = 6))
pattern_types$pattern_match_silvie_mono <- unlist(pbmclapply(1:nrow(pattern_types), function(i)
   length(grepw(gsub("XXX( XXX)?", ".+",
                     pattern_types$pattern[i]),
                filter(silvie, type %in% c("english", "german"))$Utterance_clean)), mc.cores = 6))
pattern_types$pattern_match_fion_mono <- unlist(pbmclapply(1:nrow(pattern_types), function(i)
   length(grepw(gsub("XXX( XXX)?", ".+",
                     pattern_types$pattern[i]),
                filter(fion, type %in% c("english", "german"))$Utterance_clean)), mc.cores = 6))

# merge with pattern_candidates
pattern_candidates <- left_join(pattern_candidates, pattern_types)

Now we check for every target utterance if the traceback is successful by checking whether a) the candidate pattern is attested at least twice in the main corpus, b) the omitted word(s) is/are attested in the main corpus. In many cases, more than one pattern candidate is valid; in these cases, pattern candidates with longer subsequent strings and pattern candidates with utterance-initial fixed strings (rather than XXX) are preferred, i.e. [this is XXX] is preferred over [this XXX cake] (longer consecutive fixed strings: 2 words vs. 1 word) and over [XXX is cake] (utterance-initial vs. utterance-internal/-final open slot).

# add column indicating traceback success &
# column indicating pattern
fion_mwu$success_fion_input <- logical(nrow(fion_mwu))
fion_mwu$pattern_fion_input <- character(nrow(fion_mwu))

silvie_mwu$success_silvie_input <- logical(nrow(silvie_mwu))
silvie_mwu$pattern_silvie_input <- character(nrow(silvie_mwu))

fion_mwu$success_silvie_input <- logical(nrow(fion_mwu))
fion_mwu$pattern_silvie_input <- character(nrow(fion_mwu))

silvie_mwu$success_fion_input <- logical(nrow(silvie_mwu))
silvie_mwu$pattern_fion_input <- character(nrow(silvie_mwu))

# fill those columns for the rows that
# have already been identified as exact matches
fion_mwu[which(fion_mwu$exact_match_fion_input==TRUE),]$success_fion_input <- TRUE
fion_mwu[which(fion_mwu$exact_match_fion_input==TRUE),]$pattern_fion_input <-
  fion_mwu[which(fion_mwu$exact_match_fion_input==TRUE),]$Utterance_clean

silvie_mwu[which(silvie_mwu$exact_match_silvie_input==TRUE),]$success_silvie_input <- TRUE
silvie_mwu[which(silvie_mwu$exact_match_silvie_input==TRUE),]$pattern_silvie_input <-
  silvie_mwu[which(silvie_mwu$exact_match_silvie_input==TRUE),]$Utterance_clean

fion_mwu[which(fion_mwu$exact_match_silvie_input==TRUE),]$success_silvie_input <- TRUE
fion_mwu[which(fion_mwu$exact_match_silvie_input==TRUE),]$pattern_silvie_input <-
  fion_mwu[which(fion_mwu$exact_match_silvie_input==TRUE),]$Utterance_clean


silvie_mwu[which(silvie_mwu$exact_match_fion_input==TRUE),]$success_fion_input <- TRUE
silvie_mwu[which(silvie_mwu$exact_match_fion_input==TRUE),]$pattern_fion_input <-
  silvie_mwu[which(silvie_mwu$exact_match_fion_input==TRUE),]$Utterance_clean

We define a function that allows us to trace back the frame-and-slot patterns. As arguments, it requires a) the test corpus (e.g. the data frame fion_mwu), b) the main corpus (e.g. fion_input), c) the pattern list

traceback <- function(test_corpus, main_corpus, xxx) {
  for(k in 1:nrow(test_corpus)) {
    
  # column name for checking verbatim match in main corpus
  col_em <- which(colnames(test_corpus)==paste0("exact_match_", main_corpus))
    
  if(test_corpus[k, col_em] == FALSE) {
    
    # list of patterns:
    l_cur <- xxx[[k]]
    
    # the last one (= literal utterance/target)
    # can be discarded
    target <- l_cur[length(l_cur)]
    target <- unlist(target)
    l_cur <- l_cur[1:(length(l_cur)-1)]
    
    
    # unlist
    l_cur <- unlist(l_cur)
    
    # add info from pattern_candidates table
    l_cur <- tibble(pattern = l_cur, target = target)
    l_cur <- left_join(l_cur, pattern_candidates)
    
    # keep only those with pattern_match >= 2 in the
    # target corpus & token_match = TRUE in the target
    # corpus
    
    if(main_corpus == "fion_input") {
      l_cur <- filter(l_cur, fion_input == TRUE & 
                      pattern_match_fion_input >= 2)
    } else if(main_corpus == "silvie_input") {
      l_cur <- filter(l_cur, silvie_input == TRUE & 
                      pattern_match_silvie_input >= 2)
    } else if(main_corpus == "fion_mono") {
      l_cur <- filter(l_cur, fion_mono == TRUE & 
                      pattern_match_fion_mono >= 2)
    } else if(main_corpus=="silvie_mono") {
      l_cur <- filter(l_cur, silvie_mono == TRUE & 
                      pattern_match_silvie_mono >= 2)
    }
    
    
    
    if(nrow(l_cur) > 0) {
      # specify if open slot is utterance-initial (if so,
      # it will be dispreferred)
      l_cur$utterance_initial_xxx <- grepl("^XXX", l_cur$pattern)
      l_cur %>% arrange(utterance_initial_xxx)
    }
    
    # sort by length of successive strings
    if(nrow(l_cur) > 0) {
      l_cur <- l_cur %>% arrange(desc(successive))
    }
    
    # get the relevant columns for adding the results
    col1 <- which(colnames(test_corpus) == paste0("success_", main_corpus))
    col2 <- which(colnames(test_corpus) == paste0("pattern_", main_corpus))
    
    # the first one wins out
    if(nrow(l_cur) > 0) {
      test_corpus[k, col1] <- TRUE
      test_corpus[k, col2] <- l_cur$pattern[1]
    } else {
      test_corpus[k, col1] <- FALSE
    }

    
    
  } 
  
  print(k)
  
  }
  
  return(test_corpus)
}

As we want to use the code-mixed data as test corpus, we create a data frame only with the code-mixed utterances.

# subset with code-mixed utterances
fion_mixed <- filter(fion_mwu, type == "mixed")
silvie_mixed <- filter(silvie_mwu, type == "mixed")

# empty columns for success values and patterns
# (we don't have to bother about exact matches here:
# there are none, as code-mixed and monolingual data
# are two entirely distinct sets)

fion_mixed$success_fion_mono <- logical(nrow(fion_mixed))
fion_mixed$success_silvie_mono <- logical(nrow(fion_mixed))
fion_mixed$exact_match_fion_mono <- FALSE
fion_mixed$exact_match_silvie_mono <- FALSE


silvie_mixed$success_silvie_mono <- logical(nrow(silvie_mixed))
silvie_mixed$success_fion_mono <- logical(nrow(silvie_mixed))
silvie_mixed$exact_match_silvie_mono <- FALSE
silvie_mixed$exact_match_fion_mono <- FALSE

Now we apply the function to the cases we are interested in.

# Fion --> Own input
fion_mwu <- traceback(test_corpus = fion_mwu, main_corpus = "fion_input", xxx = fion_mwu_xxx)

# Silvie --> Own input
silvie_mwu <- traceback(test_corpus = silvie_mwu, main_corpus = "silvie_input", xxx = silvie_mwu_xxx)

# Fion --> Silvie's input
fion_mwu <- traceback(test_corpus = fion_mwu, main_corpus = "silvie_input", xxx = fion_mwu_xxx)

# Silvie --> Fion's input
silvie_mwu <- traceback(test_corpus = silvie_mwu, main_corpus = "fion_input", xxx = silvie_mwu_xxx)

# Fion mixed --> Own monolingual data
fion_mixed <- traceback(test_corpus = fion_mixed, main_corpus = "fion_mono", xxx = fion_mixed_xxx)

# Silvie mixed --> Own monolingual data
silvie_mixed <- traceback(test_corpus = silvie_mixed, main_corpus = "silvie_mono", xxx = silvie_mixed_xxx)

# Fion mixed --> Silvie's monolingual data
fion_mixed <- traceback(test_corpus = fion_mixed, main_corpus = "silvie_mono", xxx = fion_mixed_xxx)

# Silvie mixed --> Fion's monolingual data
silvie_mixed <- traceback(test_corpus = silvie_mixed, main_corpus = "fion_mono", xxx = silvie_mixed_xxx)

Visualization

For visualization, we use the qbarplot function from the wizard package. As we want to get a ternary result (fixed string, frame+slot pattern, or fail), we first define a function that allows adding a column thct contains this information.

# function for getting traceback success
# as a ternary variable
tb_ternary <- function(x, col1, col2) {
  
  # add ternary traceback success
  x1 <- sapply(1:nrow(x), function(i) 
    if(x[i,which(colnames(x)==col1)] == TRUE 
       & x[i, which(colnames(x) == col2)] == TRUE ) "exact match" 
    else if (x[i,which(colnames(x)==col1)] == FALSE & 
             x[i, which(colnames(x) == col2)] == TRUE ) "frame + slot" else
               "fail")
  
  x1 <- factor(x1, levels = c("fail", "frame + slot", "exact match"))
  x1 <- droplevels(x1)
  
  return(x1)
}




# Child_mixed --> Own monolingual data

( p1 <- rbind(mutate(select(silvie_mixed, Month), 
       Child = "Silvie", Traceback_success = tb_ternary(silvie_mixed, "exact_match_silvie_mono", "success_silvie_mono")),
      mutate(select(fion_mixed, Month), 
             Child = "Fion", Traceback_success = tb_ternary(fion_mixed, "exact_match_fion_mono", "success_fion_mono"))) %>%
  qbarplot(x = "Child", fill = Traceback_success) +
  guides(fill = guide_legend(title = "Traceback success")) +
  ggtitle("Test corpus: Child - mixed,\nmain corpus: Own monolingual data") +
  theme(plot.title = element_text(face = "bold", hjust = 0.5)) )

( p2 <- rbind(mutate(select(silvie_mixed, Month), 
       Child = "Silvie", Traceback_success = tb_ternary(silvie_mixed, "exact_match_fion_mono", "success_fion_mono")),
      mutate(select(fion_mixed, Month), 
             Child = "Fion", Traceback_success = tb_ternary(fion_mixed, "exact_match_silvie_mono", "success_silvie_mono"))) %>%
  qbarplot(x = "Child", fill = Traceback_success, 
           color = brewer.pal(3, "Greens")[2:3]) +
  guides(fill = guide_legend(title = "Traceback success")) +
  ggtitle("Test corpus: Child - mixed,\nmain corpus: Other child's monolingual data") +
  theme(plot.title = element_text(face = "bold", hjust = 0.5)) )

# Child - mixed --> Input data

mixed_to_input <- rbind(
  select(filter(fion_mwu, type == "mixed"), Month, exact_match_fion_input, success_fion_input, exact_match_silvie_input, success_silvie_input) %>% mutate (Child = "Fion", Traceback_success_within = tb_ternary(filter(fion_mwu, type == "mixed"), "exact_match_fion_input", "success_fion_input"), Traceback_success_cross = tb_ternary(filter(fion_mwu, type == "mixed"), "exact_match_silvie_input", "success_silvie_input")) %>% rename(exact_match_within = exact_match_fion_input, success_within = success_fion_input, exact_match_cross = exact_match_silvie_input, success_cross = success_silvie_input),



select(filter(silvie_mwu, type == "mixed"), Month, exact_match_silvie_input, success_silvie_input, exact_match_fion_input, success_fion_input) %>% mutate (Child = "Silvie", Traceback_success_within = tb_ternary(filter(silvie_mwu, type == "mixed"), "exact_match_silvie_input", "success_silvie_input"), Traceback_success_cross = tb_ternary(filter(silvie_mwu, type == "mixed"), "exact_match_fion_input", "success_fion_input")) %>% rename(exact_match_within = exact_match_silvie_input, success_within = success_silvie_input, exact_match_cross = exact_match_fion_input, success_cross = success_fion_input)
  
)


( p3 <- qbarplot(mixed_to_input, x = "Child", fill = "Traceback_success_within") + guides(fill = guide_legend(title = "Traceback success")) +
  ggtitle("Test corpus: Child - mixed,\nmain corpus: Own input data") +
  theme(plot.title = element_text(face = "bold", hjust = 0.5)) )

( p4 <- qbarplot(mixed_to_input, x = "Child", fill = "Traceback_success_cross",  color = brewer.pal(3, "Greens")) + guides(fill = guide_legend(title = "Traceback success")) +
  ggtitle("Test corpus: Child - mixed,\nmain corpus: Other child's input data") +
  theme(plot.title = element_text(face = "bold", hjust = 0.5)) )

# export
# p1 | p2
# ggsave("mixed_to_mono.png", width = 14, height = 6)
# 
# p3 | p4
# ggsave("mixed_to_input.png", width = 14, height = 6)

As the traceback algorithm has been applied to the entire dataset, not just the code-mixed data, we are now interested in the results for the full dataset.

all_to_input <- rbind(
  select(fion_mwu, Month, exact_match_fion_input, success_fion_input, exact_match_silvie_input, success_silvie_input) %>% mutate (Child = "Fion", Traceback_success_within = tb_ternary(fion_mwu, "exact_match_fion_input", "success_fion_input"), Traceback_success_cross = tb_ternary(fion_mwu, "exact_match_silvie_input", "success_silvie_input")) %>% rename(exact_match_within = exact_match_fion_input, success_within = success_fion_input, exact_match_cross = exact_match_silvie_input, success_cross = success_silvie_input),

select(silvie_mwu, Month, exact_match_silvie_input, success_silvie_input, exact_match_fion_input, success_fion_input) %>% mutate (Child = "Silvie", Traceback_success_within = tb_ternary(silvie_mwu, "exact_match_silvie_input", "success_silvie_input"), Traceback_success_cross = tb_ternary(silvie_mwu, "exact_match_fion_input", "success_fion_input")) %>% rename(exact_match_within = exact_match_silvie_input, success_within = success_silvie_input, exact_match_cross = exact_match_fion_input, success_cross = success_fion_input)
  
)


( p1a <- qbarplot(all_to_input, x = "Child", fill = "Traceback_success_within") + guides(fill = guide_legend(title = "Traceback success")) +
  ggtitle("Test corpus: Child - entire dataset,\nmain corpus: Own input data") +
  theme(plot.title = element_text(face = "bold", hjust = 0.5)) )

( p2a <- qbarplot(all_to_input, x = "Child", fill = "Traceback_success_cross",  color = brewer.pal(3, "Greens")) + guides(fill = guide_legend(title = "Traceback success")) +
  ggtitle("Test corpus: Child - entire dataset,\nmain corpus: Other child's input data") +
  theme(plot.title = element_text(face = "bold", hjust = 0.5)) )

# export for publication:
# p1a | p2a
# ggsave("all_to_input.png", width = 14, height = 6)

Regression models

In order to evaluate whether the traceback is significantly more successful in the within-corpus approch, we fit a mixed-effects logistic regression model using the traceback result as response variable.

We do this first for the mixed to mono data:

# first, we need a "long" dataframe so that the results for within-corpus and mixed-corpus are in different rows, rather than columns.

model_input_fion <- rbind(
  mutate(select(fion_mixed, Utterance_clean, success_fion_mono), type = "within") %>% rename(success = success_fion_mono),
  mutate(select(fion_mixed, Utterance_clean, success_silvie_mono), type = "cross") %>% rename(success = success_silvie_mono)
  
)

model_input_silvie <- rbind(
  mutate(select(silvie_mixed, Utterance_clean, success_silvie_mono), type = "within") %>% rename(success = success_silvie_mono),
  mutate(select(silvie_mixed, Utterance_clean, success_fion_mono), type = "cross") %>% rename(success = success_fion_mono)
  
)

# Use within-corpus as baseline category
model_input_fion$type <- factor(model_input_fion$type, levels = c("within", "cross"))

model_input_silvie$type <- factor(model_input_silvie$type, levels = c("within", "cross"))


m_fion <- glmer(success ~ type + (1 | Utterance_clean),
      data = model_input_fion, family = "binomial")
summary(m_fion)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: success ~ type + (1 | Utterance_clean)
##    Data: model_input_fion
## 
##      AIC      BIC   logLik deviance df.resid 
##   6227.7   6248.3  -3110.9   6221.7     6999 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -6.4196 -0.0120 -0.0003  0.0044  8.8287 
## 
## Random effects:
##  Groups          Name        Variance Std.Dev.
##  Utterance_clean (Intercept) 507.8    22.53   
## Number of obs: 7002, groups:  Utterance_clean, 3029
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -8.7815     0.2174  -40.39   <2e-16 ***
## typecross    -7.6318     0.2746  -27.79   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##           (Intr)
## typecross 0.517

m_silvie <- glmer(success ~ type + (1 | Utterance_clean),
      data = model_input_silvie, family = "binomial")
summary(m_silvie)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: success ~ type + (1 | Utterance_clean)
##    Data: model_input_silvie
## 
##      AIC      BIC   logLik deviance df.resid 
##   5596.9   5618.1  -2795.5   5590.9     8555 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.5157 -0.0101 -0.0004 -0.0004  5.8686 
## 
## Random effects:
##  Groups          Name        Variance Std.Dev.
##  Utterance_clean (Intercept) 381      19.52   
## Number of obs: 8558, groups:  Utterance_clean, 4095
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -9.1552     0.2116  -43.26   <2e-16 ***
## typecross    -6.2998     0.2982  -21.12   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##           (Intr)
## typecross 0.494

And now, the same for the mixed to input data.

model_fion_input <- rbind(
  filter(fion_mwu, type == "mixed") %>% select(Utterance_clean, success_fion_input) %>% mutate(Type = "within") %>% rename(success = success_fion_input),

filter(fion_mwu, type == "mixed") %>% select(Utterance_clean, success_silvie_input)  %>% mutate(Type = "cross") %>% rename(success = success_silvie_input)
)

model_silvie_input <- rbind(
  filter(silvie_mwu, type == "mixed") %>% select(Utterance_clean, success_silvie_input) %>% mutate(Type = "within") %>% rename(success = success_silvie_input),

filter(silvie_mwu, type == "mixed") %>% select(Utterance_clean, success_fion_input)  %>% mutate(Type = "cross") %>% rename(success = success_fion_input)
)

# within-corpus as baseline category
model_fion_input$Type <- factor(model_fion_input$Type, levels = c("within", "cross"))

model_silvie_input$Type <- factor(model_silvie_input$Type, levels = c("within", "cross"))

m_fion_input <- glmer(success ~ Type + (1 | Utterance_clean),
                      data = model_fion_input, family = "binomial")

summary(m_fion_input)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: success ~ Type + (1 | Utterance_clean)
##    Data: model_fion_input
## 
##      AIC      BIC   logLik deviance df.resid 
##   6578.2   6598.8  -3286.1   6572.2     6999 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.9517 -0.0150 -0.0016  0.0474  3.2505 
## 
## Random effects:
##  Groups          Name        Variance Std.Dev.
##  Utterance_clean (Intercept) 461.6    21.49   
## Number of obs: 7002, groups:  Utterance_clean, 3029
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -8.2998     0.2222  -37.35   <2e-16 ***
## Typecross    -4.4587     0.3171  -14.06   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##           (Intr)
## Typecross 0.488

m_silvie_input <- glmer(success ~ Type + (1 | Utterance_clean),
                      data = model_silvie_input, family = "binomial")

summary(m_silvie_input)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: success ~ Type + (1 | Utterance_clean)
##    Data: model_silvie_input
## 
##      AIC      BIC   logLik deviance df.resid 
##   6502.7   6523.8  -3248.3   6496.7     8555 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.1642 -0.0085 -0.0004  0.0080  5.2288 
## 
## Random effects:
##  Groups          Name        Variance Std.Dev.
##  Utterance_clean (Intercept) 641.4    25.33   
## Number of obs: 8558, groups:  Utterance_clean, 4095
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -9.4870     0.2273  -41.74   <2e-16 ***
## Typecross    -6.1614     0.4611  -13.36   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##           (Intr)
## Typecross 0.593

And finally, we apply the same analysis to the traceback based on all the child’s utterances (rather than just the code-mixed ones).

# input for the two models:
m_all_input_fion <- rbind(fion_mwu %>% select(Utterance_clean, success_fion_input) %>% set_names("Utterance", "success") %>% mutate(Type = "within"), 
                          fion_mwu %>% select(Utterance_clean, success_silvie_input) %>% set_names("Utterance", "success") %>% mutate(Type = "cross"))

m_all_input_silvie <- rbind(silvie_mwu %>% select(Utterance_clean, success_silvie_input) %>% set_names("Utterance", "success") %>% mutate(Type = "within"), 
                          silvie_mwu %>% select(Utterance_clean, success_fion_input) %>% set_names("Utterance", "success") %>% mutate(Type = "cross"))


# change factor levels
m_all_input_fion$Type <- factor(m_all_input_fion$Type, levels = c("within", "cross"))

m_all_input_silvie$Type <- factor(m_all_input_silvie$Type, levels = c("within", "cross"))


# models:
m_all_fion <- glmer(success ~ Type + (1 | Utterance), family = "binomial", data = m_all_input_fion)
summary(m_all_fion)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: success ~ Type + (1 | Utterance)
##    Data: m_all_input_fion
## 
##      AIC      BIC   logLik deviance df.resid 
##  54326.1  54353.2 -27160.0  54320.1    61509 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.4134 -0.2082  0.1045  0.1919  1.3415 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  Utterance (Intercept) 21       4.583   
## Number of obs: 61512, groups:  Utterance, 21729
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.57460    0.05120   30.76   <2e-16 ***
## Typecross   -1.20017    0.03351  -35.81   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##           (Intr)
## Typecross -0.436

m_all_silvie <- glmer(success ~ Type + (1 | Utterance), family = "binomial", data = m_all_input_silvie)
summary(m_all_silvie)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: success ~ Type + (1 | Utterance)
##    Data: m_all_input_silvie
## 
##      AIC      BIC   logLik deviance df.resid 
##  54044.2  54070.9 -27019.1  54038.2    54555 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.35995 -0.29532  0.05806  0.19750  1.42113 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  Utterance (Intercept) 21.65    4.653   
## Number of obs: 54558, groups:  Utterance, 23145
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.17832    0.04799  -3.716 0.000203 ***
## Typecross   -1.25671    0.03468 -36.236  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##           (Intr)
## Typecross -0.234

Summary: Model coefficients for traceback of code-mixed data

For the paper, we combine all coefficients in one table:

rbind(
  
summary(m_fion)$coef %>% as.data.frame %>% rownames_to_column() %>% mutate(model = "Fion --> mono"),
summary(m_silvie)$coef %>% as.data.frame %>% rownames_to_column() %>% mutate(model = "Silvie --> mono"),
summary(m_fion_input)$coef %>% as.data.frame %>% rownames_to_column() %>% mutate(model = "Fion --> input"),
summary(m_silvie_input)$coef %>% as.data.frame %>% rownames_to_column() %>% mutate(model = "Silvie --> input")
) %>% pretty_df() # %>% write_excel_csv("model_coef.csv")

Summary: Model coefficients for traceback of all data to the input

rbind(
  
summary(m_all_fion)$coef %>% as.data.frame %>% rownames_to_column() %>% mutate(model = "Fion --> input"),
summary(m_all_silvie)$coef %>% as.data.frame %>% rownames_to_column() %>% mutate(model = "Silvie --> input")
) %>% pretty_df() # %>% write_excel_csv("model_coef2.csv")

Bigram plots

In addition to the traceback approach, we want to analyze the bigrams in the code-mixed data using a method adapted from Silge & Robinson. We start from scratch by reading in the code-mixed data (and only the code-mixed data) again and adding additional information, especially the more coarse-grained periodization of the data that we will use to track each child’s language development. We reduce the data to the multi-word units.

# read data 

d_fion <- read_csv("../master/fion_mixed.csv")

## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   Filename = col_character(),
##   Age = col_character(),
##   Month = col_character(),
##   Utt_no = col_double(),
##   Speaker = col_character(),
##   Utterance = col_character(),
##   Lang_Tags = col_character(),
##   Utterance_clean = col_character(),
##   type = col_character(),
##   Wordcount = col_double(),
##   gram1 = col_character(),
##   gram2 = col_character(),
##   gram3 = col_character(),
##   gram4 = col_character()
## )

d_silvie <- read_csv("../master/silvie_mixed.csv")

## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   Filename = col_character(),
##   Age = col_character(),
##   Month = col_character(),
##   Utt_no = col_double(),
##   Speaker = col_character(),
##   Utterance = col_character(),
##   Lang_Tags = col_character(),
##   Utterance_clean = col_character(),
##   type = col_character(),
##   Wordcount = col_double(),
##   gram1 = col_character(),
##   gram2 = col_character(),
##   gram3 = col_character(),
##   gram4 = col_character()
## )

# add wordcount
d_fion$wordcount <- sapply(1:nrow(d_fion), 
                           function(i) wordcount(d_fion$Utterance_clean[i]))


d_silvie$wordcount <- sapply(1:nrow(d_silvie), 
                           function(i) wordcount(d_silvie$Utterance_clean[i]))


# bin months
d_fion$Months <- character(nrow(d_fion))
d_fion$Months <- ifelse(d_fion$Month %in% levels(factor(d_fion$Month))[1:7], "02;03-02;09", d_fion$Months)
d_fion$Months <- ifelse(d_fion$Month %in% levels(factor(d_fion$Month))[8:14], "02;10-03;04", d_fion$Months)
d_fion$Months <- ifelse(d_fion$Month %in% levels(factor(d_fion$Month))[15:21], "03;05-03;11", d_fion$Months)

d_silvie$Months <- character(nrow(d_silvie))
d_silvie$Months <- ifelse(d_silvie$Month %in% levels(factor(d_silvie$Month))[1:6], "02;03-02;09", d_silvie$Months)
d_silvie$Months <- ifelse(d_silvie$Month %in% levels(factor(d_silvie$Month))[7:12], "02;10-03;03", d_silvie$Months)
d_silvie$Months <- ifelse(d_silvie$Month %in% levels(factor(d_silvie$Month))[13:28], "03;04-03;19", d_silvie$Months)



# only multi-word units
mwu_fion <- filter(d_fion, wordcount > 1)
mwu_silvie <- filter(d_silvie, wordcount > 1)

Next, we get the bigrams and count them.

# get bigrams 

bigrams_fion <- mwu_fion %>% unnest_tokens(bigram, Utterance_clean, token = "ngrams", n = 2)
bigrams_silvie <- mwu_silvie %>% unnest_tokens(bigram, Utterance_clean, token = "ngrams", n = 2)


# one column for each word
bigrams_fion <- bigrams_fion %>% separate(bigram, c("word1", "word2"), sep = " ", remove = F)
bigrams_silvie <- bigrams_silvie %>% separate(bigram, c("word1", "word2"), sep = " ", remove = F)


# add child column
bigrams_fion <- mutate(bigrams_fion, Child = "Fion")
bigrams_silvie <- mutate(bigrams_silvie, Child = "Silvie")

# add together
bigrams <- rbind(bigrams_fion, bigrams_silvie)

# count
bigrams_count <- bigrams %>% group_by(Child, Months) %>% count(word1, word2, sort = T)

And finally, we visualize them. First, Fion:

# a) for Fion

my_plots <- list()


for(i in 1:length(levels(factor(d_fion$Months)))) {
  
  # code from Silge & Robinson, Tidy Text Mining with R, CC-BY-NC-3.0,
  # https://www.tidytextmining.com/ngrams.html
  
  # check if there are data
  l <- bigrams_count %>%
    filter(n > 5, Child == "Fion",
           Months == levels(factor(d_fion$Months))[i])
  
  if(nrow(l) > 0) {
    # get bigram graph edges and vertices
    bigram_graph <- bigrams_count %>%
      filter(n > 5, Child == "Fion",
             Months == levels(factor(d_fion$Months))[i]) %>%
      ungroup %>% select(word1, word2, n) %>% graph_from_data_frame()
    
    # add arrow
    a <- grid::arrow(type = "closed", length = unit(.15, "inches"))
    
    # plot
    set.seed(2020)
    ( p <- ggraph(bigram_graph, layout = "fr") +
        geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
                       arrow = a, end_cap = circle(.07, 'inches')) +
        geom_node_point(color = "lightblue", size = 5) +
        geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
        ggtitle(paste0("Fion", ", ", levels(factor(bigrams_count$Months))[i])) +
        theme_void() + 
        theme(plot.title = element_text(face = "bold", hjust = 0.5)) +
        theme(plot.margin = unit(c(1,1,1,1), "cm")) )
    
    
    # get current number of plots
    lng <- length(my_plots)
    
    my_plots[[lng+1]] <- p
    
  }

}


my_plots[[1]]

my_plots[[2]]

my_plots[[3]]

And then Silvie:

my_plots2 <- list()


for(i in 1:length(levels(factor(d_silvie$Months)))) {
  
  # code from Silge & Robinson, Tidy Text Mining with R, CC-BY-NC-3.0,
  # https://www.tidytextmining.com/ngrams.html
  
  # check if there are data
  l <- bigrams_count %>%
    filter(n > 5, Child == "Silvie",
           Months == levels(factor(d_silvie$Months))[i])
  
  if(nrow(l) > 0) {
    # get bigram graph edges and vertices
    bigram_graph <- bigrams_count %>%
      filter(n > 5, Child == "Silvie",
             Months == levels(factor(d_silvie$Months))[i]) %>%
      ungroup %>% select(word1, word2, n) %>% graph_from_data_frame()
    
    # add arrow
    a <- grid::arrow(type = "closed", length = unit(.15, "inches"))
    
    # plot
    set.seed(2020)
    ( p <- ggraph(bigram_graph, layout = "fr") +
        geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
                       arrow = a, end_cap = circle(.07, 'inches')) +
        geom_node_point(color = "lightblue", size = 5) +
        geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
        ggtitle(paste0("Silvie", ", ", levels(factor(bigrams_count$Months))[i])) +
        theme_void() + 
        theme(plot.title = element_text(face = "bold", hjust = 0.5)) +
        theme(plot.margin = unit(c(1,1,1,1), "cm")) )
    
    
    # get current number of plots
    lng <- length(my_plots2)
    
    my_plots2[[lng+1]] <- p
    
  }
  
}

my_plots2[[1]]

my_plots2[[2]]

my_plots2[[3]]

(Note: The colors indicating the different languages have been added manually using Inkscape).

This page was created using R Markdown using the Bootstrap theme flatly under the MIT License. All material in this repository is licensed under CC-BY 4.0.