session info:
## R version 4.4.1 (2024-06-14)
## Platform: x86_64-apple-darwin20
## Running under: macOS Sonoma 14.5
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: Europe/Berlin
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## loaded via a namespace (and not attached):
## [1] digest_0.6.35 R6_2.5.1 fastmap_1.2.0 xfun_0.45
## [5] cachem_1.1.0 knitr_1.47 htmltools_0.5.8.1 rmarkdown_2.27
## [9] lifecycle_1.0.4 cli_3.6.2 sass_0.4.9 data.table_1.15.4
## [13] jquerylib_0.1.4 compiler_4.4.1 rstudioapi_0.16.0 tools_4.4.1
## [17] evaluate_0.24.0 bslib_0.7.0 yaml_2.3.8 rlang_1.1.4
## [21] jsonlite_1.8.8
Install and load packages
# install CRAN packages (if not yet installed)
sapply(c("tidyverse", "devtools", "readxl", "kableExtra", "openxlsx", "DT"), function(x) if(!is.element(x, installed.packages())) install.packages(x, dependencies = T, repos = "http://cran.us.r-project.org"))## $tidyverse
## NULL
##
## $devtools
## NULL
##
## $readxl
## NULL
##
## $kableExtra
## NULL
##
## $openxlsx
## NULL
##
## $DT
## NULL
# if this doesn't work, check sfla.ch for the package
if(!is.element("collostructions", installed.packages())) {
install.packages("https://sfla.ch/wp-content/uploads/2021/02/collostructions_0.2.0.tar.gz", repos = NULL)
}
# load packages
library(tidyverse)
library(collostructions) # available at sfla.ch
library(readxl)
library(openxlsx)
library(kableExtra)
library(DT)# read data ---------------------------------------------------------------
# read data
d <- read_csv("RBKxn_true_positives.csv")## Rows: 1011 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (13): X1, LexikalischerFrame, KE1, KEE, KE2, Lesart, Schwierigkeit, Genr...
## dbl (3): Koerzion, No, Date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# read dwds21 lemma list
dwds <- read_delim("dwds21_verb_lemma_list", delim = "\t",
quote = "",
col_names = c("Freq", "Lemma"))## Rows: 14476 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): Lemma
## dbl (1): Freq
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# get frequency list
d_tbl <- d$X1 %>% table %>% as_tibble() %>% setNames(c("Lemma", "Freq_kx")) %>% arrange(desc(Freq_kx))
d_tbl <- left_join(d_tbl, dwds)## Joining with `by = join_by(Lemma)`
d_tbl <- replace_na(d_tbl, list(Freq = 0, Freq_kx = 0))
# d_tbl <- d_tbl[,c(1,3,2)]
# remove instances where corpus frequency < construction frequency
d_tbl <- d_tbl[!(d_tbl$Freq < d_tbl$Freq_kx),]
# collexeme analysis with G2
collex_RBK <- collex(as.data.frame(d_tbl), corpsize = sum(dwds$Freq), delta.p = T)
# add further association measures
collex_RBK <- left_join(collex_RBK,
select(collex(as.data.frame(d_tbl), corpsize = sum(dwds$Freq), am = "odds"), COLLEX, COLL.STR.ODDS)) ## Joining with `by = join_by(COLLEX)`
## Rows: 850 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (13): X1, LexikalischerFrame, KEE, KE2, Lesart, Schwierigkeit, Genre, Bi...
## dbl (3): Koerzion, No, Date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# get frequency list
d_tbl <- d$X1 %>% table %>% as_tibble() %>% setNames(c("Lemma", "Freq_kx")) %>% arrange(desc(Freq_kx))
d_tbl <- left_join(d_tbl, dwds)## Joining with `by = join_by(Lemma)`
d_tbl <- replace_na(d_tbl, list(Freq = 0, Freq_kx = 0))
# d_tbl <- d_tbl[,c(1,3,2)]
# remove instances where corpus frequency < construction frequency
d_tbl <- d_tbl[!(d_tbl$Freq < d_tbl$Freq_kx),]
# collexeme analysis with G2
collex_RPV <- collex(as.data.frame(d_tbl), corpsize = sum(dwds$Freq), delta.p = T)
# add further association measures
collex_RPV <- left_join(collex_RPV,
select(collex(as.data.frame(d_tbl), corpsize = sum(dwds$Freq), am = "odds"), COLLEX, COLL.STR.ODDS))## Joining with `by = join_by(COLLEX)`
# distinctive collexeme analysis targeting frames
frames_dist_collex <- left_join(
rbk %>% select(LexikalischerFrame) %>% table %>% as.data.frame %>% setNames(c("Frame", "Freq_RM")),
d %>% select(LexikalischerFrame) %>% table %>% as.data.frame %>% setNames(c("Frame", "Freq_RPV"))
) %>%
# replace NAs by 0
replace_na(list(Freq_RM = 0, Freq_RPV = 0)) %>%
# distinctive collexeme analysis
collex.dist() ## Joining with `by = join_by(Frame)`
# add number of lexeme types for each frame
# in the construction with which it is associated
frames_dist_collex$n_types <- numeric(nrow(frames_dist_collex))
for(i in 1:nrow(frames_dist_collex)) {
if(as.character(frames_dist_collex[i,]$ASSOC)=="Freq_RM") {
frames_dist_collex$n_types[i] <- rbk[which(rbk$LexikalischerFrame==as.character(frames_dist_collex[i,]$COLLEX)),]$X1 %>% unique %>% length
} else {
frames_dist_collex$n_types[i] <- d[which(d$LexikalischerFrame==as.character(frames_dist_collex[i,]$COLLEX)),]$X1 %>% unique %>% length
}
}
#frames_dist_collex %>% round_this() %>% write.xlsx("distinctive_collexmes_frames.xlsx")
frames_dist_collex %>% DT::datatable()