Preliminaries

session info:

sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: x86_64-apple-darwin20
## Running under: macOS Sonoma 14.5
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: Europe/Berlin
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## loaded via a namespace (and not attached):
##  [1] digest_0.6.35     R6_2.5.1          fastmap_1.2.0     xfun_0.45        
##  [5] cachem_1.1.0      knitr_1.47        htmltools_0.5.8.1 rmarkdown_2.27   
##  [9] lifecycle_1.0.4   cli_3.6.2         sass_0.4.9        data.table_1.15.4
## [13] jquerylib_0.1.4   compiler_4.4.1    rstudioapi_0.16.0 tools_4.4.1      
## [17] evaluate_0.24.0   bslib_0.7.0       yaml_2.3.8        rlang_1.1.4      
## [21] jsonlite_1.8.8

Install and load packages

# install CRAN packages (if not yet installed)
sapply(c("tidyverse", "devtools", "readxl", "kableExtra", "openxlsx",  "DT"), function(x) if(!is.element(x, installed.packages())) install.packages(x, dependencies = T, repos = "http://cran.us.r-project.org"))
## $tidyverse
## NULL
## 
## $devtools
## NULL
## 
## $readxl
## NULL
## 
## $kableExtra
## NULL
## 
## $openxlsx
## NULL
## 
## $DT
## NULL
# if this doesn't work, check sfla.ch for the package
if(!is.element("collostructions", installed.packages())) {
  install.packages("https://sfla.ch/wp-content/uploads/2021/02/collostructions_0.2.0.tar.gz", repos = NULL)
}


# load packages
library(tidyverse)
library(collostructions) # available at sfla.ch
library(readxl)
library(openxlsx)
library(kableExtra)
library(DT)

Helper function

# helper function: if column is numeric,
# format it in such a way that the first two
# decimal places are displayed

round_this <- function(df) {
  
  for(j in 1:ncol(df)) {
    if(is.numeric(df[,j])) {
      df[,j] <- formatC(df[,j], format = "f", digits = 2)
    }
  }
  
  return(df)
  
}

Simple collexeme analysis: Reflexive motion construction

# read data ---------------------------------------------------------------

# read data
d <- read_csv("RBKxn_true_positives.csv")
## Rows: 1011 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (13): X1, LexikalischerFrame, KE1, KEE, KE2, Lesart, Schwierigkeit, Genr...
## dbl  (3): Koerzion, No, Date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# read dwds21 lemma list
dwds <- read_delim("dwds21_verb_lemma_list", delim = "\t", 
                   quote = "",
                   col_names = c("Freq", "Lemma"))
## Rows: 14476 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): Lemma
## dbl (1): Freq
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# get frequency list
d_tbl <- d$X1 %>% table %>% as_tibble() %>% setNames(c("Lemma", "Freq_kx")) %>% arrange(desc(Freq_kx))
d_tbl <- left_join(d_tbl, dwds)
## Joining with `by = join_by(Lemma)`
d_tbl <- replace_na(d_tbl, list(Freq = 0, Freq_kx = 0))
# d_tbl <- d_tbl[,c(1,3,2)]

# remove instances where corpus frequency < construction frequency
d_tbl <- d_tbl[!(d_tbl$Freq < d_tbl$Freq_kx),]

# collexeme analysis with G2
collex_RBK <- collex(as.data.frame(d_tbl), corpsize = sum(dwds$Freq), delta.p = T) 

# add further association measures
collex_RBK <- left_join(collex_RBK,
          select(collex(as.data.frame(d_tbl), corpsize = sum(dwds$Freq), am = "odds"), COLLEX, COLL.STR.ODDS)) 
## Joining with `by = join_by(COLLEX)`
# collex_RBK %>% round_this() %>% write.xlsx("simple_collexeme_analysis_reflexive_Bewegungskonstruktion.xlsx")

collex_RBK %>% DT::datatable()
# rename
rbk <- d

Covarying collexeme analysis: Reflexive motion construction

# distinctive collexeme analysis: verb + preposition
RM_covar <- d %>% select(X1, Prep) %>% as.data.frame %>% collex.covar(raw = TRUE)

# write.xlsx(round_this(RM_covar), "RM_covar.xlsx")

RM_covar %>% DT::datatable()

Simple collexeme analysis: Reflexive particle-verb construction

# reflexive Partikelverbkonstruktion ----------
d <- read_csv("RPVKxn_true_positives.csv")
## Rows: 850 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (13): X1, LexikalischerFrame, KEE, KE2, Lesart, Schwierigkeit, Genre, Bi...
## dbl  (3): Koerzion, No, Date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# get frequency list
d_tbl <- d$X1 %>% table %>% as_tibble() %>% setNames(c("Lemma", "Freq_kx")) %>% arrange(desc(Freq_kx))
d_tbl <- left_join(d_tbl, dwds)
## Joining with `by = join_by(Lemma)`
d_tbl <- replace_na(d_tbl, list(Freq = 0, Freq_kx = 0))
# d_tbl <- d_tbl[,c(1,3,2)]

# remove instances where corpus frequency < construction frequency
d_tbl <- d_tbl[!(d_tbl$Freq < d_tbl$Freq_kx),]

# collexeme analysis with G2
collex_RPV <- collex(as.data.frame(d_tbl), corpsize = sum(dwds$Freq), delta.p = T) 

# add further association measures
collex_RPV <- left_join(collex_RPV,
          select(collex(as.data.frame(d_tbl), corpsize = sum(dwds$Freq), am = "odds"), COLLEX, COLL.STR.ODDS))
## Joining with `by = join_by(COLLEX)`
# collex_RPV %>% round_this %>% write.xlsx("simple_collexeme_analysis_reflexive_Partikelverbkonstruktion.xlsx")

collex_RPV %>% DT::datatable()
#%>% write.xlsx("simple_collexeme_analysis_reflexive_Partikelverbkonstruktion.xlsx")

Distinctive collexeme analysis

# distinctive collexeme analysis targeting frames

frames_dist_collex <- left_join(
  rbk %>% select(LexikalischerFrame) %>% table %>% as.data.frame %>% setNames(c("Frame", "Freq_RM")),
  d %>% select(LexikalischerFrame) %>% table %>% as.data.frame %>% setNames(c("Frame", "Freq_RPV"))
) %>% 
  # replace NAs by 0
  replace_na(list(Freq_RM = 0, Freq_RPV = 0)) %>%
  # distinctive collexeme analysis
  collex.dist() 
## Joining with `by = join_by(Frame)`
# add number of lexeme types for each frame
# in the construction with which it is associated

frames_dist_collex$n_types <- numeric(nrow(frames_dist_collex))

for(i in 1:nrow(frames_dist_collex)) {
  if(as.character(frames_dist_collex[i,]$ASSOC)=="Freq_RM") {
 frames_dist_collex$n_types[i] <- rbk[which(rbk$LexikalischerFrame==as.character(frames_dist_collex[i,]$COLLEX)),]$X1 %>% unique %>% length

} else {
  frames_dist_collex$n_types[i] <- d[which(d$LexikalischerFrame==as.character(frames_dist_collex[i,]$COLLEX)),]$X1 %>% unique %>% length
}

}

#frames_dist_collex %>% round_this() %>% write.xlsx("distinctive_collexmes_frames.xlsx")



frames_dist_collex %>% DT::datatable()