Let X verb Y

Preliminaries

Data available here

Packages etc.

library(tidyverse)
library(concordances)
library(ngram)
library(ggsankey)
library(ggiraph)
library(collostructions) # available at sfla.ch
library(DT)

Corpus

Query

The corpus was queried via the Corpus Workbench using the query

[lemma="let"] [pos="N.*|PP.*"] [pos="V.*"] [pos="DT"]? [pos="N.*"]

i.e. the lemma let, followed by a noun or pronoun, a verb, an optional determiner, and a noun, so that each hit is four or five tokens long.
Reading data

# list files, read data
f <- list.files("data_encow/", full.names = TRUE)
d <- lapply(1:length(f), function(i) readLines(f[i]))
d <- unlist(d)
# convert to UTF-8 to make sure that gsub works
d <- iconv(d, to = "utf-8")
# remove anything before and after the keyword span
d <- gsub(".*<|>.*", "", d)
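To see what this cleanup does, here is a minimal sketch; the input line is made up, assuming the export marks the match with angle brackets and encodes each token as a token/lemma pair:

# hypothetical input line, for illustration only
x <- "left context <Let/let us/we know/know> right context"
gsub(".*<|>.*", "", x)
# [1] "Let/let us/we know/know"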
# word count per hit (4 or 5, depending on whether the optional determiner is present)
wc <- sapply(1:length(d), function(i) wordcount(d[i]))
# to dataframe; in five-word hits the determiner occupies slot 4,
# so the second noun sits in slot 5 rather than slot 4
df <- tibble(
  let = sapply(1:length(d), function(i) unlist(strsplit(d[i], split = " "))[1]),
  noun1 = sapply(1:length(d), function(i) unlist(strsplit(d[i], split = " "))[2]),
  verb = sapply(1:length(d), function(i) unlist(strsplit(d[i], split = " "))[3]),
  noun2 = ifelse(wc == 5,
                 sapply(1:length(d), function(i) unlist(strsplit(d[i], split = " "))[5]),
                 sapply(1:length(d), function(i) unlist(strsplit(d[i], split = " "))[4]))
)
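As an aside, the code above splits each hit four times; splitting once and indexing the resulting token lists is equivalent and a bit lighter (a sketch under the same assumptions as the original code):

# split each hit once, then pick tokens by position
toks <- strsplit(d, split = " ")
df <- tibble(
  let   = sapply(toks, `[`, 1),
  noun1 = sapply(toks, `[`, 2),
  verb  = sapply(toks, `[`, 3),
  noun2 = ifelse(wc == 5, sapply(toks, `[`, 5), sapply(toks, `[`, 4))
)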
Data wrangling
# DF with lemmas ("(unknown)" is replaced by the token)
df_lemmas <- tibble(
  let = "let",
  noun1 = sapply(1:nrow(df), function(i) ifelse(gsub(".*/", "", df$noun1[i]) == "(unknown)", gsub("/.*", "", df$noun1[i]), gsub(".*/", "", df$noun1[i]))),
  verb = sapply(1:nrow(df), function(i) ifelse(gsub(".*/", "", df$verb[i]) == "(unknown)", gsub("/.*", "", df$verb[i]), gsub(".*/", "", df$verb[i]))),
  noun2 = sapply(1:nrow(df), function(i) ifelse(gsub(".*/", "", df$noun2[i]) == "(unknown)", gsub("/.*", "", df$noun2[i]), gsub(".*/", "", df$noun2[i])))
)
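The three ifelse() lines repeat the same token/lemma logic; a small vectorised helper keeps that logic in one place (a sketch; get_lemma is a hypothetical name, not part of the original code):

# hypothetical helper: prefer the lemma, fall back to the token
# whenever the tagger returned "(unknown)"
get_lemma <- function(x) {
  lemma <- gsub(".*/", "", x)
  token <- gsub("/.*", "", x)
  ifelse(lemma == "(unknown)", token, lemma)
}

df_lemmas <- tibble(
  let   = "let",
  noun1 = get_lemma(df$noun1),
  verb  = get_lemma(df$verb),
  noun2 = get_lemma(df$noun2)
)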
# add counts: one for the full quadruple, one each for noun1, verb, and noun2
df_lemmas_with_counts <- df_lemmas %>%
  group_by(let, noun1, verb, noun2) %>% mutate(count = n()) %>% ungroup() %>%
  group_by(noun1) %>% mutate(noun1_count = n()) %>% ungroup() %>%
  group_by(verb) %>% mutate(verb_count = n()) %>% ungroup() %>%
  group_by(noun2) %>% mutate(noun_count = n()) %>% ungroup()
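The same four frequency columns can be added more compactly with dplyr::add_count(), which wraps the group_by()/mutate(n())/ungroup() pattern (a sketch, equivalent as long as none of the count column names exist yet):

df_lemmas_with_counts <- df_lemmas %>%
  add_count(let, noun1, verb, noun2, name = "count") %>%
  add_count(noun1, name = "noun1_count") %>%
  add_count(verb, name = "verb_count") %>%
  add_count(noun2, name = "noun_count")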
# add column with the minimum of the three frequency counts
df_lemmas_with_counts$min_count <- df_lemmas_with_counts[, 6:8] %>% apply(., 1, min)
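Indexing the frequency columns by position ([, 6:8]) silently breaks if columns are added or reordered; pmin() on the named columns computes the same row-wise minimum (a sketch):

df_lemmas_with_counts$min_count <- with(df_lemmas_with_counts,
  pmin(noun1_count, verb_count, noun_count))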
We focus on the most well-attested items, i.e. cases where each of noun1, verb, and noun2 is attested at least 1000 times (min_count >= 1000), and visualise them as a Sankey diagram:
p1 <- filter(df_lemmas_with_counts, min_count >= 1000) %>%
  make_long(let, noun1, verb, noun2) %>%
  ggplot(aes(x = x,
             next_x = next_x,
             node = node,
             next_node = next_node,
             label = node,
             fill = factor(node))) +
  geom_sankey() +
  geom_sankey_label(size = 2, color = "white", fill = "gray40") +
  scale_fill_viridis_d() +
  guides(fill = "none") +
  theme_minimal() +
  theme(text = element_text(size = 18),
        strip.text = element_text(size = 18),
        legend.text = element_text(size = 18),
        legend.title = element_text(size = 18, face = "bold"),
        axis.text = element_blank(),
        axis.title = element_blank(),
        line = element_blank(),
        title = element_blank())
# zoomable plot
girafe(
  ggobj = p1,
  options = list(
    opts_zoom(min = 1, max = 60),
    opts_toolbar(saveaspng = TRUE)
  )
)
Multiple covarying collexeme analysis

Finally, we use multiple covarying collexeme analysis (Stefanowitsch & Gries 2005) to investigate the pattern, first across all three slots…
select(df_lemmas, "noun1", "verb", "noun2") %>% group_by(noun1, verb, noun2) %>% summarise(
n = n()
%>% as.data.frame() %>% collex.covar.mult(raw = FALSE) %>%
) #head(100) %>%
::datatable() DT
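With raw = FALSE, the input is presumably read as one row per attested type combination plus a frequency column, which is exactly what the summarise() step yields. dplyr::count() builds the same table in one step (a sketch, assuming it feeds collex.covar.mult() the identical table):

df_lemmas %>%
  count(noun1, verb, noun2, name = "n") %>%
  as.data.frame() %>%
  collex.covar.mult(raw = FALSE)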
… and then for verb and noun2 only:
select(df_lemmas, "verb", "noun2") %>% group_by(verb, noun2) %>% summarise(
n = n()
%>% as.data.frame() %>% collex.covar(raw = FALSE) %>%
) #head(100) %>%
::datatable() DT