Preliminaries

session info:

sessionInfo()
## R version 4.2.0 (2022-04-22)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur/Monterey 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## loaded via a namespace (and not attached):
##  [1] digest_0.6.29   R6_2.5.1        jsonlite_1.8.0  magrittr_2.0.3 
##  [5] evaluate_0.15   stringi_1.7.8   rlang_1.0.6     cli_3.4.1      
##  [9] rstudioapi_0.13 jquerylib_0.1.4 bslib_0.3.1     rmarkdown_2.14 
## [13] tools_4.2.0     stringr_1.4.1   xfun_0.31       yaml_2.3.5     
## [17] fastmap_1.1.0   compiler_4.2.0  htmltools_0.5.2 knitr_1.39     
## [21] sass_0.4.1

Install and load packages

# install CRAN packages (if not yet installed)
sapply(c("data.table", "tidyverse", "devtools", "readxl", "kableExtra", "ngram", "networkD3", "igraph", "network", "patchwork", "koRpus", "pbapply", "tidytext", "cluster", "ggrepel", "animation", "kableExtra", "DT", "vroom", "Rtsne"), function(x) if(!is.element(x, installed.packages())) install.packages(x, dependencies = T, repos = "http://cran.us.r-project.org"))
# install non-CRAN packages (if not yet installed)
if(!is.element("concordances", installed.packages())) {
devtools::install_github("hartmast/concordances")
}
# install non-CRAN packages (if not yet installed)
if(!is.element("wizard", installed.packages())) {
devtools::install_github("hartmast/wizard")
}
# if this doesn't work, check sfla.ch for the package
if(!is.element("collostructions", installed.packages())) {
  install.packages("https://sfla.ch/wp-content/uploads/2021/02/collostructions_0.2.0.tar.gz", repos = NULL)
}


# install "wordVectors" if not yet installed
if(!is.element("wordVectors", installed.packages())) {
devtools::install_github("bmschmidt/wordVectors")
}


# load packages
library(readxl)
library(tidyverse)
library(ngram)
library(networkD3)
library(igraph)
library(network)
library(patchwork)
library(koRpus)
library(pbapply)
library(tidytext)
library(cluster)
library(ggrepel)
library(animation)
library(kableExtra)
library(DT)
library(collostructions) # available at sfla.ch
library(concordances) # available at github.com/hartmast/concordances
library(wizard) # available at github.com/hartmast/wizard
library(vroom)
library(wordVectors)

ENCOW

Helper functions

The following commands define a few helper functions that will be used in the following steps:

# logarithmize and return 0 instead of Inf if x==0
log0 <- function(x) {
  x <- ifelse(x == 0, 0, log(x))
  return(x)
}
# function for "prettifying" df output
# inspired by https://github.com/rmcelreath/rethinking/blob/d0978c7f8b6329b94efa2014658d750ae12b1fa2/R/utilities.r
pretty_df <- function(df) {
  
  # function for rounding
  round_this <- function(x, digits = 2) ifelse(x < 1, signif(x, digits = digits), round(x, digits = digits))
  
  # function for getting prettified dataframe
  df_pretty <- as.data.frame(lapply(1:length(df), 
                       function(i) if(!class(df[[i]]) %in% c("character", "factor"))
                       {
                         round_this(df[[i]])
                       } else {
                         return(df[[i]])
                       })
  )
  
  # set names to original names
  colnames(df_pretty) <- colnames(df)
  return(df_pretty)
  
  
}
# search for entire words
grepw <- function(pattern, x, perl = F, ...) {
  grep(paste("^", pattern, "$", sep="", collapse=""), x, perl = perl, ...)
}
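
For illustration, here are a few hypothetical calls to these helpers (the values are made up and not taken from the data):

# minimal usage examples (made-up values)
log0(c(0, 1, exp(1)))                    # returns 0 0 1 instead of -Inf 0 1
grepw("new", c("new", "news", "renew"))  # matches whole words only: returns 1
pretty_df(data.frame(lemma = c("a", "b"), p = c(0.012345, 2.34567)))
# the numeric column is rounded to 0.012 and 2.35; the character column is left unchanged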

Read in data

d <- read_xlsx("../data/ENCOW_x_is_the_new_y_without_false_hits.xlsx")

Data wrangling

We exclude false hits, and we semi-automatically identify the heads of compounds and phrases. (In the data, the x and y elements have been lemmatized manually; wherever an element consists of a multi-word phrase and the head is not the rightmost element, the head has been highlighted via UPPERCASE; the code below uses this markup to identify the heads.)
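
As a small illustration of this markup convention (with made-up lemma strings, and ignoring the additional part-of-speech condition used in the loop below):

# illustration with made-up lemma strings (not from the data)
toy_lemmas <- c("GAME night", "quantum physics", "orange")
sapply(toy_lemmas, function(x) {
  parts <- unlist(strsplit(x, " "))
  if (length(parts) > 1 && any(grepl("[A-Z]{2,}", parts))) {
    tolower(parts[grepl("[A-Z]{2,}", parts)][1])  # uppercase-marked head
  } else {
    tolower(parts[length(parts)])                 # default: rightmost element
  }
})
# expected result: "game", "physics", "orange"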

# exclude false hits ------------------------------------------------------
d <- filter(d, keep == "y")
# add wordcount for x and y lemmas ----------------------------------------
d$wordcount_x <- sapply(1:nrow(d), function(i) wordcount(trimws(d$Lemma_x[i])))
d$wordcount_y <- sapply(1:nrow(d), function(i) wordcount(trimws(d$Lemma_y[i])))
# get heads of compounds and phrases --------------------------------------
# find instances in which there are words
# written entirely in uppercase (= our way of
# marking heads in the data, unless in the case of
# right-hand heads)
# empty columns for heads
d$head_x <- NA; d$head_y <- NA
# get heads
for(i in 1:nrow(d)) {
  
  if(d$wordcount_x[i]>1) {
    if(d$pos_x[i]!="NE" & grepl("[A-Z]{2,}", d$Lemma_x[i])) {
      d$head_x[i] <- tolower(unlist(strsplit(d$Lemma_x[i], " "))[grepl("[A-Z]{2,}", unlist(strsplit(d$Lemma_x[i], " ")))][1])
    } else{
      temp <- unlist(strsplit(d$Lemma_x[i], " "))
      d$head_x[i] <- tolower(temp[length(temp)])
    }
  } else {
    d$head_x[i] <- tolower(d$Lemma_x[i])
  }
  
  
  if(d$wordcount_y[i]>1) {
    if(d$pos_y[i]!="NE" & grepl("[A-Z]{2,}", d$Lemma_y[i])) {
      d$head_y[i] <- tolower(unlist(strsplit(d$Lemma_y[i], " "))[grepl("[A-Z]{2,}", unlist(strsplit(d$Lemma_y[i], " ")))][1])
    } else{
      temp <- unlist(strsplit(d$Lemma_y[i], " "))
      d$head_y[i] <- tolower(temp[length(temp)])
    }
  } else {
    d$head_y[i] <- tolower(d$Lemma_y[i])
  }
  
  
}
# remove all with "unclear" -----------------------------------------------
# backup copy for subsequent analysis
d_backup <- d
d <- d[-which(d$concept_x=="unclear" | d$concept_y=="unclear"),]

Types, tokens, hapax legomena

# get hapaxes:
tibble(
  types_x = length(unique(d$head_x)),
  types_y = length(unique(d$head_y)),
  types   = length(unique(paste0(d$head_x, "/", d$head_y))),
  hapaxes_x = table(d$head_x) %>% as_tibble(.name_repair = "unique") %>% setNames(c("lemma_x", "n")) %>% filter(n == 1) %>% nrow(),
hapaxes_y = table(d$head_y) %>% as_tibble(.name_repair = "unique") %>% setNames(c("lemma_y", "n")) %>% filter(n == 1) %>% nrow(),
hapaxes_all = paste0(d$head_x, "/", d$head_y) %>% table %>% as_tibble() %>% setNames(c("lemma", "n")) %>% filter(n == 1) %>% nrow,
  tokens = nrow(d)
) %>% kbl()
## New names:
## New names:
## • `` -> `...1`
types_x types_y types hapaxes_x hapaxes_y hapaxes_all tokens
2000 1651 2805 1505 1241 2479 3848

Explore concepts

The data have been annotated for the concepts of the x and y elements. We use heatmaps to explore the co-occurrence of the different concept categories.

# network ----------------------------------------------------------------
d$concept_x <- factor(d$concept_x); d$concept_y <- factor(d$concept_y)
tbl <- d %>% select(concept_x, concept_y) %>% table %>% as.data.frame
tbl$number_x <- as.numeric(factor(tbl$concept_x))
tbl$number_y <- as.numeric(factor(tbl$concept_y))
# add a column in which the frequency is 0 if
# concept_x == concept_y
tbl$Freq_noself <- ifelse(tbl$concept_x == tbl$concept_y, NA, tbl$Freq)
# sort factors by frequency in concept_x ----------------------------------
conc_by_freq <- d$concept_x %>% table %>% sort(decreasing = T) %>% rownames()
tbl$concept_x <- factor(tbl$concept_x, levels = conc_by_freq)
tbl$concept_y <- factor(tbl$concept_y, levels = conc_by_freq)
# heatmaps ----------------------------------------------------------------
tbl %>% ggplot(aes(x = concept_x, y = concept_y, fill = log0(Freq))) +
  geom_tile() + scale_fill_gradient(low = "yellow", high = "darkred") +
  theme(axis.text.x = element_text(angle=45, hjust=.9)) +
  guides(fill = guide_legend(title = "LogFreq"))

( p1 <- tbl %>% filter(Freq > 0) %>% ggplot(aes(x = concept_x, y = concept_y, fill = log0(Freq), label = Freq)) +
  geom_tile() + scale_fill_gradient(low = "yellow", high = "darkred") +
    guides(fill = guide_legend(title = "LogFreq")) + theme_classic() +
  theme(axis.text.x = element_text(angle=45, hjust=.9)) +
  geom_text(col = ifelse(filter(tbl, Freq > 0)$Freq > 6, "white", "black"), size = 4) +
    theme(axis.text = element_text(size = 18)) +
    theme(axis.title = element_text(size = 18)) +
    theme(strip.text = element_text(size = 18)) +
    theme(legend.text = element_text(size = 18)) +
    theme(legend.title = element_text(size = 18, face = "bold")) +
    theme(text = element_text(size = 18))
    ) + guides(fill = 'none')

ggsave("heatmap_x_is_the_new_y.png", height = 5, width = 6) 
 
( p2 <- tbl %>% filter(Freq > 0) %>% 
  ggplot(aes(x = concept_x, y = concept_y, fill = log0(Freq_noself), label = Freq_noself)) +
  geom_tile() + scale_fill_gradient(low = "yellow", high = "darkred") +
  guides(fill = guide_legend(title = "LogFreq")) + theme_classic() +
  theme(axis.text.x = element_text(angle=45, hjust=.9)) +
  geom_text(col = ifelse(filter(tbl, Freq > 0)$Freq_noself > 6, "white", "black"), size = 4) +
  theme(axis.text = element_text(size = 18)) +
  theme(axis.title = element_text(size = 18)) +
  theme(strip.text = element_text(size = 18)) +
  theme(legend.text = element_text(size = 18)) +
  theme(legend.title = element_text(size = 18, face = "bold")) +
  theme(text = element_text(size = 18)) + guides(fill = "none") )

Collostructional analysis

To perform the collostructional analysis, we draw on a manual lemmatization of the items in the x and y slots. We first read in the lemmatization table.

# export heads for lemmatization ------------------------------------------
c(d %>% filter(!(concept_x=="person" & pos_x=="NE")) %>% select(head_x),
  d %>% filter(!(concept_y=="person" & pos_y=="NE")) %>% select(head_y)) %>%
  unlist %>% unique %>% as.data.frame 
#%>% write_excel_csv("lemmatization.csv")
# re-import lemmatized lists ----------------------------------------------
l <- read_csv("../data/lemmatization.csv")

Now we generate frequency lists with the help of the original dataframe and the lemmatization table.

# get frequency of x & y lemmas -----------------------------------------------
lx <- left_join(tibble(word = d$head_x),
          l) %>% na.omit %>% select(lemma) %>% table %>% as.data.frame(stringsAsFactors = F) %>% setNames(c("lemma", "Freq"))
ly <- left_join(tibble(word = d$head_y),
                l) %>% na.omit %>% select(lemma) %>% table %>% as.data.frame(stringsAsFactors = F) %>% setNames(c("lemma", "Freq"))

We additionally read in the ENCOW frequency list (available at https://www.webcorpora.org/opendata/; only the relevant subset is used here) to get the corpus frequencies of the lemmas in question.

encow <- read_csv("../data/x_is_the_new_y_encow_frequencies.csv")
## Rows: 4457 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): word
## dbl (1): Freq_encow
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Using this dataset, we add the corpus frequencies to the frequency tables created before.

# add frequencies -----------------------------------------------
lx$Freq_encow <- sapply(1:nrow(lx), function(i) sum(encow[grepw(lx$lemma[i], encow$word),]$Freq_encow))
ly$Freq_encow <- sapply(1:nrow(ly), function(i) sum(encow[grepw(ly$lemma[i], encow$word),]$Freq_encow))

Given that we draw on a manually lemmatized dataset while the ENCOW frequency table is based on automatically tagged data, the match is not perfect: a few lemmas are not attested at all in the ENCOW lemma list. As this only affects a handful of items, they are discarded in the next step.

# omit all that are not attested in ENCOW
lx <- filter(lx, Freq_encow > 0)
ly <- filter(ly, Freq_encow > 0)

This dataset can now be used as input for simple collexeme analysis.

# get total frequency of all nouns and adjectives
# in the ENCOW corpus
#encow <- vroom("/Users/stefanhartmann/Downloads/encow16ax.lp.tsv.gz", delim = "\t", quote = "", col_names = c("lemma", "POS", "Freq"))

# filter(encow, POS %in% c("NN", "NE", "JJ", "JJR", "JJS")) %>% select(Freq) %>% sum # result: 1967348249


left_join(collex(lx, corpsize = 1967348249, delta.p = T),
select(collex(lx, corpsize = 1967348249, am = "odds"), COLLEX, COLL.STR.ODDS)) %>% datatable() %>% formatSignif(columns = c("OBS", "EXP", "COLL.STR.LOGL"), digits=5)
## Joining, by = "COLLEX"
left_join(collex(ly, corpsize = 1967348249, delta.p = T),
select(collex(ly, corpsize = 1967348249, am = "odds"), COLLEX, COLL.STR.ODDS)) %>% datatable() %>% formatSignif(columns = c("OBS", "EXP", "COLL.STR.LOGL"), digits=5)
## Joining, by = "COLLEX"
# collex.dist(lx) %>% datatable() %>% formatSignif(columns = c("E.CXN1", "E.CXN2", "COLL.STR.LOGL"), digits=3)
# collex.dist(ly) %>% datatable() %>% formatSignif(columns = c("E.CXN1", "E.CXN2", "COLL.STR.LOGL"), digits=3)

In addition, we follow a “tupleized” (Gries 2019) collostructional approach: we use an association measure that is not sensitive to sample size (the odds ratio) and plot it against the logarithmized frequency of the individual lexemes.
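
For illustration, the (log) odds ratio for a single lexeme can be derived from the usual 2x2 collostructional table. The following sketch uses made-up frequencies and is only meant to unpack the association measure, not to reproduce the internals of collex():

# made-up 2x2 table for one lexeme (illustration only)
o11 <- 30                            # lexeme in the construction
o12 <- 3848 - o11                    # construction without the lexeme (cf. token count above)
o21 <- 5000 - o11                    # lexeme elsewhere in the corpus (hypothetical corpus frequency: 5000)
o22 <- 1967348249 - o11 - o12 - o21  # everything else
(o11 * o22) / (o12 * o21)            # odds ratio
log((o11 * o22) / (o12 * o21))       # log odds ratio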

p1 <- collex(lx, corpsize = 1805183579, am = "odds") %>% ggplot(aes(x = log1p(OBS), y = log1p(COLL.STR.ODDS), label = COLLEX, col = log1p(OBS))) + geom_text() + theme_bw() + xlab("Log frequency") + ylab("Log odds ratio") + scale_color_continuous(low = rgb(0,.7,1,.4), high = "black") + guides(col = 'none') + ggtitle("X") + theme(plot.title = element_text(face = "bold", hjust = 0.5))
 
p2 <- collex(ly, corpsize = 1805183579, am = "odds") %>% ggplot(aes(x = log1p(OBS), y = log1p(COLL.STR.ODDS), label = COLLEX, col = log1p(OBS))) + geom_text() + theme_bw() + xlab("Log frequency") + ylab("Log odds ratio") + scale_color_continuous(low = rgb(0,.7,1,.4), high = "black") + guides(col = 'none') + ggtitle("Y") + theme(plot.title = element_text(face = "bold", hjust = 0.5))


p1 | p2
## Warning in log1p(COLL.STR.ODDS): NaNs produced

## Warning in log1p(COLL.STR.ODDS): NaNs produced
## Warning: Removed 6 rows containing missing values (geom_text).
## Warning in log1p(COLL.STR.ODDS): NaNs produced

## Warning in log1p(COLL.STR.ODDS): NaNs produced
## Warning: Removed 4 rows containing missing values (geom_text).

ggsave("collex_xnewy_encow.png", height = 7, width = 13)
## Warning in log1p(COLL.STR.ODDS): NaNs produced
## Warning in log1p(COLL.STR.ODDS): NaNs produced
## Warning: Removed 6 rows containing missing values (geom_text).
## Warning in log1p(COLL.STR.ODDS): NaNs produced

## Warning in log1p(COLL.STR.ODDS): NaNs produced
## Warning: Removed 4 rows containing missing values (geom_text).

Covarying collexeme analysis

In addition, we perform a covarying collexeme analysis, which measures the association between the fillers of the x slot and the fillers of the y slot within the construction.

d %>% select(head_x, head_y) %>% as.data.frame %>% collex.covar %>% pretty_df() %>% datatable() 

Semantic vector-space analysis: Word2vec

To assess the semantics of the slot fillers in more detail, we use distributional semantics. More specifically, we draw on word2vec. Word2vec is originally the name of a piece of software comprising two main algorithms for representing words as dense vectors, but the term has become virtually synonymous with the approach itself. We use the R package wordVectors, which builds on the original word2vec code. We use the skip-gram approach, which is generally considered to work better with small amounts of training data (see e.g. this tutorial) than the alternative continuous-bag-of-words (cbow) approach. The model was trained on the first of the 17 downloadable sentence shuffles of ENCOW using the following code:

# read data & export text files without annotation
# Note: the downloadable file containing the first sentence shuffle
# of ENCOW has been split into c. 100 parts, named xaa, xab, etc.,
# hence the pattern in the list.files() call below. The loop
# produces txt files containing only the words, without annotations.
f <- list.files("/Volumes/INTENSO/Corpora/ENCOW/", pattern = "^x..", full.names = T) 

for(i in 1:length(f)) {
  d <- vroom_lines(f[i])
  d <- gsub("^<.*|\t.*", "", d)
  d <- d[d!=""]
  vroom_write_lines(d, paste0("/Volumes/INTENSO/Corpora/ENCOW/encow_word2vec_training/encow16b_words_for_training001", i, ".txt"))
  print(i)
}

# Next, the prep_word2vec command from the wordVectors package is used
# to prepare the training file, containing just the words in lowercase,
# without punctuation. The model is then trained on the basis of this
# file and exported.
prep_word2vec(origin="/Volumes/INTENSO/Corpora/ENCOW/encow_word2vec_training/words", destination="/Volumes/INTENSO/Corpora/ENCOW/encowa_w2v_words.txt", lowercase=T,bundle_ngrams=1)

# train model:
model <- train_word2vec("/Volumes/INTENSO/Corpora/ENCOW/encowa_w2v_words.txt", output_file  = "/Volumes/INTENSO/Corpora/ENCOW/encow_vectors_word_based.bin",vectors=100,threads=4,window=5,iter=3,negative_samples=5)

# export:
write_rds(model, "model.Rds")

This model is then used to visualize the semantic space occupied by the slot fillers in [X is the new Y]:

# read model
model <- readRDS("/Users/stefanhartmann/sciebo/Projekte/snowclones/word2vec/model.Rds")
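
As a quick optional sanity check (not part of the original pipeline), the nearest neighbours of a word can be inspected; this assumes that the chosen word occurs in the model's vocabulary:

# optional: inspect nearest neighbours in the vector space
# (assumes "coffee" is in the training vocabulary)
model %>% closest_to("coffee", n = 10)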

We first add a column with the manually corrected lemmas to the dataframe.

# add lemmatization to head -----------------------------------------------
d <- left_join(d, rename(l, lemma_head_x = lemma), by = c("head_x" = "word"))
d <- left_join(d, rename(l, lemma_head_y = lemma), by = c("head_y" = "word"))
# fill NA head columns
d$lemma_head_x <- ifelse(is.na(d$lemma_head_x), d$head_x, d$lemma_head_x)
d$lemma_head_y <- ifelse(is.na(d$lemma_head_y), d$head_y, d$lemma_head_y)

Based on these data, we visualize the semantic “space” occupied by the x and y slot fillers. To do so, we first compile a frequency table: how often do the individual slot fillers occur in the x and y slots, respectively?

d_tbl <- c(d$lemma_head_x, d$lemma_head_y) %>% unique
freq_x <- sapply(1:length(d_tbl), function(i) length(which(d$lemma_head_x == d_tbl[i])))
freq_y <- sapply(1:length(d_tbl), function(i) length(which(d$lemma_head_y == d_tbl[i])))
d_tbl <- tibble(lemma = d_tbl,
                freq_x = freq_x,
                freq_y = freq_y)

# add total frequency and relative frequency in x slot
d_tbl$total <- d_tbl$freq_x + d_tbl$freq_y
d_tbl$rel   <- d_tbl$freq_x / d_tbl$total


# get cosine distance between all slot fillers
cosine_dists <- cosineDist(model[[d_tbl$lemma, average = FALSE]], model[[d_tbl$lemma, average = FALSE]])


# add cosine distance between x and y slot filler
# to dataframe d

d$distance_x_y <- sapply(1:nrow(d), function(i) ifelse(d$lemma_head_x[i] %in% rownames(cosine_dists) & d$lemma_head_y[i] %in% colnames(cosine_dists), cosine_dists[which(rownames(cosine_dists)==d$lemma_head_x[i]), which(colnames(cosine_dists)==d$lemma_head_y[i])], NA))

# plot the semantic space

# Multi-Dimensional Scaling
cosine_mds_matrix <- cosine_dists %>% cmdscale()
cosine_mds <- cosine_mds_matrix %>% as.data.frame %>% rownames_to_column()
colnames(cosine_mds)[1] <- "lemma"


# alternative dimensionality reduction technique: t-SNE
cosine_rtsne <- Rtsne::Rtsne(cosine_mds_matrix)

# add TSNE to cosine_mds
cosine_mds <- cbind(cosine_mds, setNames(as.data.frame(cosine_rtsne$Y), c("dim1", "dim2")))


# filter: only items occurring at least 10 times
d_tbl <- left_join(d_tbl, cosine_mds) %>% na.omit() %>% filter(total>=10) %>% filter(lemma != "x")
## Joining, by = "lemma"
d_tbl <- rbind(d_tbl, left_join(d_tbl, cosine_mds)[2,])
## Joining, by = c("lemma", "V1", "V2", "dim1", "dim2")
# add an additional row whose only function is to
# increase the font size of the remaining items (because
# its frequency is lower than that of the item with the smallest
# total frequency)

# MDS results:
set.seed(1985)
rbind(d_tbl, tibble(lemma = "", freq_x = 0, freq_y = 0, total = 5, rel = 0, V1 = 0, V2 = 0, dim1 = 0, dim2 = 0)) %>% ggplot(aes(x = V1, y = V2, size = log10(total), col = rel, label = lemma)) + geom_text_repel(max.overlaps = 21) + guides(col = 'none', size = 'none') + scale_color_continuous(low = "blue", high = "red") + theme_bw() + xlab("dim1") + ylab("dim2")

#ggsave("distsem_xnewy_w2v.png", width = 6.5, height = 6, dpi = 500)

# t-SNE results:
set.seed(1985)
rbind(d_tbl, tibble(lemma = "", freq_x = 0, freq_y = 0, total = 5, rel = 0, V1 = 0, V2 = 0, dim1 = 0, dim2 = 0)) %>% ggplot(aes(x = dim1, y = dim2, size = log10(total), col = rel, label = lemma)) + geom_text_repel(max.overlaps = 21) + guides(col = 'none', size = 'none') + scale_color_continuous(low = "blue", high = "red") + theme_bw() + xlab("dim1") + ylab("dim2")

# ggsave("distsem_xnewy_w2v_tsne.png", width = 6.5, height = 6, dpi = 500)

# MDS results without the tweaked font size:
set.seed(1985)
d_tbl %>% ggplot(aes(x = V1, y = V2, size = log10(total), col = rel, label = lemma)) + geom_text_repel(max.overlaps = 17) + guides(col = 'none', size = 'none') + scale_color_continuous(low = "blue", high = "red") + theme_bw() + xlab("dim1") + ylab("dim2")

# ggsave("distsem_xnewy_w2v.png", width = 6, height = 6, dpi = 500)

# in black&white:
set.seed(1985)
d_tbl %>% ggplot(aes(x = V1, y = V2, size = log10(total), col = rel, label = lemma)) + geom_text_repel(max.overlaps = 17) + guides(col = 'none', size = 'none') + scale_color_continuous(low = "grey40", high = "grey80") + theme_bw() + xlab("dim1") + ylab("dim2")

# ggsave("distsem_xnewy_w2v_bw.png", width = 8, height = 8)

Visualizing semantic distance between X and Y

distances <- d %>% filter(!is.na(distance_x_y)) %>% arrange(desc(distance_x_y)) %>% 
  select(lemma_head_x, lemma_head_y, distance_x_y) %>% na.omit %>% 
  unique

distances %>% ggplot(aes(x = distance_x_y)) + geom_histogram(binwidth = 0.03, col ="black", fill = "grey50") + theme_classic() 

# distances %>% ggplot(aes(x = distance_x_y)) + geom_histogram(aes(y = ..density..), binwidth = 0.03, col = "black", fill = "grey50") + geom_density(col = rgb(1,.2,.4,.7), lwd = 1.5) + theme_classic() + #ylab("Count") + 
#   xlab("Cosine distance")
# ggsave("cosine_distance_hist_w2v.png")

Selected items

find_items <- function(x, y) {
  return(distances[which(distances$lemma_head_x == x & distances$lemma_head_y == y),])
}

dist_examples <- rbind(find_items("anxiety", "depression"),
      find_items("female", "male"),
      find_items("democrat", "republican"),
      find_items("abnormality", "disease"),
      find_items("alpha", "beta"),
      find_items("audio", "video"),
      find_items("pear", "raspberry"),
      find_items("sushi", "pizza"),
      find_items("small", "large"),
      find_items("environmentalist", "socialist"),
      find_items("sugar", "nicotine"),
      find_items("computer", "radio"),
      find_items("publishing", "literacy"),
      find_items("paper", "confidentiality"),
      find_items("mean", "green"),
      find_items("sustainable", "black"),
      find_items("ethics", "green"),
      find_items("funds", "black")
      ) %>% mutate(lemmas = factor(paste0(lemma_head_x, " - ", lemma_head_y), levels = paste0(lemma_head_x, " - ", lemma_head_y)))

dist_examples <- dist_examples %>% arrange(distance_x_y)
dist_examples$lemmas <- factor(dist_examples$lemmas, levels = dist_examples$lemmas)
dist_examples %>% ggplot(aes(x = distance_x_y, y = lemmas)) + geom_col(fill = "black") + theme_bw() + ylab("Lemmas") + xlab("Cosine distance")

# ggsave("xnewy_examples_distance_w2v.png")

Semantic vector-space analysis: manual

Before using word2vec, we used a manual semantic vector-space approach following Levshina (2015). For the sake of completeness, we document it here in addition to the word2vec approach.

A list of collocates for the lexical items in the x and y slots was compiled using the downloadable portion of the ENCOW corpus. More specifically, the data were retrieved as follows:

  1. A lemma list was exported from the concordance of “X is the new Y” ENCOWA hits after manually deleting all false hits.
  2. The downloadable ENCOWBX files were queried using CWB. For each query item, a maximum of 10,000 hits was taken into account. More precisely,
  2a. A frequency list of all c. 2,000 lemmas was compiled in the first of the 15 batches of which the ENCOWBX corpus is comprised, using the following queries:
cwb-scan-corpus -o freqs1.gz -r "/Volumes/INTENSO/Corpora/ENCOW/cwb-registry" -f 10 ENCOWA lemma+0="/90s|degree|hayes|supervisor|aa\+|democrat|horse|pregnancy|amis|transparency|plumbing|retention|virtual|spd|apple|torres|content|pr|recommendation|selfie|service|functionality|link|pakistan|arab-american|assertiveness|tax|university|hotel|arsenal|hoover|hamas|pension|cooking|junkie|fever|wells|internet|max|zombie|magnets|hunter|china|idealism|myth|environmentalist|portugal|swansea|mouth|sarcasm|doll-art|catchpole|page/brin|fox|mcgregor|seigel|hope|hangover|kosteniuk|courtney|utah|campaign|anti-terrorism|gathering|anti-americanism|anti-christianity|anti-zionism|government|islamophobia|antizionism|williams|facebook|experience|angry|console|glass|medicine|app|htc|engineering|jacket|gap|brazil|latvia|entrepreneur|ceos|einstein|virginia|alaska|allagents|cotton|spur|cinema|games|politics|science|mark|crow|curator|blogger|dairy|meat|mould|arab|okra|america|flabby|conversation|bolshevism|september|bronte|sleep|c|c-word|culture|masters|relationship|sharing|forums|bourbon|burrito|neck|peeps|pumpkin|sriracha|stress|liberalism|silvstedt|men|slipper|roll|wild|american|developer|terrorist|zionist|shoe|poncho|grosjean|python|computing|nightwing|simpson|rpi|sledding|beardism|smiths|normal|ugly|sov|kitchen|rabbit|black|turk|gonzales|beijing|detroit|hitchens|alpha|dvcs|rhianna|phone|crotch|small|hijab|trillion|trinity|observation|eisfeld|atoms|dull|clothing|accountability|advertising|airsoft|algorithm|aloeminium|ambiguity|analytics|anti-?semitism|atheism|audio|austerity|average|jacob|balsemic|barter|basic|jack|bearish|bearskin|beige|belgian|belgium|bicycles|bigomy|8-?bit|64-?bit|bitch|bitch-?shaming|bitchy|bleak|blog|blogging|blonde|blue|bold|brindle|brown|budget|budgeting|burlesque|bustles|busy|butcher|camo|canada|cannibalism|carbon|cars|ceph|chain|characters|chav|cheating|cheerios|chemistry|chickens|children|chiropractic|choir|christian|cloud|coachbuilt|acceptance|color|colors|commission|confidence|cork|corpse|coworking|crap|creative|creativity|crying|curation|dancing|dark|data|day|deficit|denim|depression|dialogue|display|dot|pink|drag|drama|elvis|green|epic|ereaders|ethical|everything|evidence|evil|exo-?suits|eyeliner|units|factor|failure|getting|fat|fear|flat|mac|fluffy|folk|footwear|free|friday|frugal|funds|fur|fusion|garden|gay|geek|ghost|ginger|back|gluten-?free|gold|google\+|goth|gray|greed|grey|growth|guts|hair|hash|hat|healthy|history|horses|house|hunting|hybridi[sz]ation|hyperspeciali[zs]ation|hypocrisy|ignorance|imperfection|indigo|innovation|intelligence|investing|irc|being|jail|javascript|jesus|k|cyberpunk|khaki|birth|knowledge|korean|layout|learning|liberation|libertarian|lime|live|malt|marketing|mashup|masks|matter|mauve|shopping|meantime|measurement|reaching|media|meeting|microcontent|mobile|modesty|monstrosity|mpg|muslim|navy|nerd|network|neurotic|niche|noir|non-?tribal/cd"
cwb-scan-corpus -o freqs2.gz -r "/Volumes/INTENSO/Corpora/ENCOW/cwb-registry" -f 10 ENCOWA lemma+0="/non-?tribalism|noticeable|nude|obama|objectivity|obscurity|off-?topic|openness|orange|oxymoron|paisley|palestinian|paper|perception|persistence|jones|phishing|piracy|pixie|plus|podcasting|poetry|point|poly|poo|postmodernism|poverty|pregnant|prints|privacy|product|progressive|extroversion|publishing|purple|pvc|quirky|racism|rape|rare|red|diamond|redesign|regulation|republican|research|reserve|retro|maleficientevil|robot|romanticism|room|rye|over-?50s|saleflat|saving|bitching|shop-?dropping|silence|silver|singledom|skepticism|slip|slow|smart|snow|snowclone|sociability|software|soup|spacebook|square|ss|state|stealth|stinky|stock|street|stripes|stupidity|sustainable|sweden|system|tags|tattoo|taupe|driving|tea|technology|uniformity|thrift|thriftiness|tinfoil|toast|transformation|trust|truth|truthy-?satire|tumblr|hate|twitter-?hate|typography|ukulele|underwear|ux|vamp|vertigo|video|violet|vodka|voice|vulnerability|wave|wheelchair|white|wine|wireless|women|yellow|zeldman\.com|artist|os|outlet|yellowface|mexican|pedophile|cameron|body|cast|vid|self-?publishing|speaking|opml|networks|bald|brunette|driver|football|chop|mccain|valentine|laker|information|strong|soaps|islamism|everton|hyde|pride|icon|web|blook|google|city|gibson|stick|girlfriend|girl|c-?ptsd|attorney|cowardice|spain|stamp|flying|implant|chas|warwickshire|burien|canvas|rainbow|cauliflower|ostrich|stroke|boutique|iphone|star|wristband|toque|whedon-harris|book|comment|brewer|b|b\+|d|theory|glamping|quays|dementia|balotelli|wall|coach|neo-?liberalism|talentism|computer|biodiversity|brand|hunk|jericoa|barbel|mickle|albom|view-?ticker|company|clooney|bond|fabianski|3d|datacenter|chavez|complex|library|museum|twihards|boldness|marcum|player|civility|left|retriever|uncertainty|demoing|greens|sherry|cruise|angela|thatcher|kilcourse|customi[sz]ation|miami|goat|curry|newcastle|texas|vegas|objectification|chick|sex|homocysteine|xiaoming|anger|msft|stadium|supermarket|bush|access|soda|twitter|device|mallory|dvd|television|shareholder|periphery|weekly|movement|jazz|metal|rock|comics|pcism|trashy|strachan|diagram|cliches|touch|watt|aerosols|java|php|molly|ajax|card|cake|therapy|preschool|economy|militarisation|right|years|puzzles|papyrus|retweet|trackback|christ-?bots|crime|environmentalism|islam|socialism|terrorism|jv-?ists|animosity|promise|collaboration|hights|ryanair|grass|timber|wood|podosphere|non-?conformity|rebellion|moderate|unpredictability|bible|travel|community|eggs|brewing|craft|urls|coal|trademark|oil|slimness|nicaragua|panama|skegness|ted|responsibility|revolt|boro|pipe|denialism|social|regurgitation|baseball|darts|fairtrade|ham|show|doctor|gaming|nuts|debt|jews|codes|process|bacon|cookie|cupcake|macaron|pie|brownie|popcorn|vintage|cricket|biology|israel|manga|website|guardian|arizona|parker|feeling|carver|guy|past|photoshopping|metadata|baggs|goldacre|night|losses|retirement|months|frugality|building|numb|deja-?vu|stafford|download|beta|bellamy|shannon|maude|ariza|anxiety|breadth|chic|hand|browser|laptop|happiness|hall|wilshere|kate|class|print|conservative|whaley|murray|price|abnormality|memory|ram|disney|pixar|abuse|thief|dot-?gov|biotech|plant|sheik|wal-?mart|lukaku|copy|canopy|sober|bro|prison|tools|easy|norwood|papua|west|halloween|sociology|radio|anti-?feminism|thomas|peterballb|literacy|usa|economics|tweet|e-?mail|extremism|eminem|stevens|board|messaging|indie|participation|tape|australia|slang|manuel|self-?expression|derivative
s|inequality|meritocracy|old|man|demers|luxury|journalists|chains|essex|ign|asia|europe|rachel|nestor|aoltv|good|ubiquity|blockbuster|mediocrity|brows|deleting|pinterest/cd"
cwb-scan-corpus -o freqs3a.gz -r "/Volumes/INTENSO/Corpora/ENCOW/cwb-registry" -f 10 ENCOWA lemma+0="/antifacism|blur|salar|samarra|family|farming|friends|ron|drone|fishermen|anti-?fascism|republicanism|zionism|neocon|credit|food|smoothie|digital|smartphone|july|empowerfullment|murder|mclaren|belly|offal|videogame|elf|consideration|fission|youtube|kindergarten|third|dictator|ramsey|crippled|wisconsin|eurostar|raclette|soccer|casual|drive|six|bashing|estonia|monaco|who|sister|bill|haw|paid-for|dependency|banana|monday|sunday|thursday|wednesday|evening|mother|door|brewery|baby|humans|electro|awkward|instability|author|alper|design|film|h&m|skype|workspace|architecture|coffee|minecraft|curating|goy|immigrant|obese|people|bots|duck|judai[sz]er|frazer|time|british|iran|poland|irs|assistant|tsa|french|arteta|grant|tongue|boys|bluebell|anti-?glamour|manchester|festival|sprinkle|cooling|diaoyu|staying|art|attention|copper|energy|glue|land|water|cycling|diving|gardening|poker|shooting|surfing|warcraft|wow|impression|bad|mediocre|github|nike\.com|yahoo|humanism|emo|koreans|ninja|fantasy|artisanal|youths|megacorps|chaput|reshammiya|capello|statistics|academy|large|turkey|ethics|glow|mean|plustainability|pollution|resilience|sustainability|accessible|hud|pattern|rid|gentle|bagram|sword|bolter|job|dimitriades|prunty|carb|builder|jihad|bump|burslem|hour|fletch|cleverley|ralph|shop|twilight|fact|to-?truth|california|image|affluence|individualism|clarendon|sale|those|vicodin|seo|manhattanite|wheats|dollar|b\.a\.|ba|college|hairspray|online|robotic|tech|geography|ahmadinejad|assad|laden|bama|blair|shamir|ahmedinejad|putin|hussein|vlad|space|invasion|reform|bedroom|homeland-?referencing|place|hypermiling|hoppy|sour|cold|deadspot|sql|arrogance|shopper|afr|diesel|skrtel|shahzad|intel|microsoft|sap|landis|spinach|britain|ias|computationalism|pragmatism|realism|chrome|firefox|ipad|webkit|school|corporation|designer|financier|hacker|outbound|paddock|customer|care|sanity|outside|arm|backloading|explorer|nebraska|philosophy|extremist|english|earnest|church|honduras|japan|money|icloud|battles|thompson|sam|desmond|bourne|february|june|may|october|korea|flashplayer|shorts|pants|woodley|park|asian|believers|eurosceptic|hispanic|patriot|unemployed|war|incarceration|wod|stanley|lula|harmon|bachmann|taplin|brooks|wilson|mccallum|hagel|dunham|hanks|george|bangkok|elway|audience|assange/cd"
cwb-scan-corpus -o freqs3b.gz -r "/Volumes/INTENSO/Corpora/ENCOW/cwb-registry" -f 10 ENCOWA lemma+0="/chipping|zanthura|bloke|comfort|chance|kagawa|collard|porn|aquilani|blake|dougal|keepin\'|barack|sauce|gesture|benghazi|quesarito|oligarch|cupid|asa|group|nra|limbaugh|voyeurism|knowing|bbq|afg-?pak|cyrenaica|kr|wroclaw|neuköln|national|bnp|bridgford|licenses|typewriter|dubai|macao|macau|br|blu-?ray|spotify|events|programming|tears|dunne|fun|iraq|derby|stoke|centre|freak|blackpool|work|keith|smoker|lutz|neoconservative|news|starbucks|camera|text-?ad|keeping|varnish|citizenship|code|numeracy|scripting|technologies|rag-?mags|charts|doncaster|draw|quiet|abreu|jams|brow|alec|corden|magazine|overgrowth|simplicity|inspiration|ipod|rove|aguilera|blog-?hubs|pc|minority|diarra|romeu|island|female|milan|shiregreen|messi|january|casta|kardashian|viewtiful|capaldi|engagement|geolocation|hr|journalism|pleasure|proof|storytelling|support|teaching|crocs|sto|ledger|dando|muscat|april|lewe|mfa|training|dean|romney|reid|mcleary|lofts|search|client|gigabyte|core|sponsorship|ribbon|aluminium|carel|beam|eisenberg|milliband|bianca|ne-yo|nando|merton|macro|g\.|goggle|nokia|android|indies|age|framework|flash|neuron|kawasaki|chelsea|polypropylene|vista|billion|sims|maximalism|neutrality|burkh?a|beaver|lynx|dead|music|debater|michigan|pharmaceutical|self-?promotion|thing|belfast|tuesday|programmeri[zs]ation|meaning|crustacean|taxpayer|lymington|lee|choice|less|london|pannu|stylus|owls|ben|macpherson|warming|distro|prs|male|atheist|mongodb|celebrity|fiction|sf|fabric|steve|kazaa|el-sisi|local|labour|regionalism|regionality|fake|roadside|russia|taliban|us|hizbollah|islamist|israeli|leader|neo-?con|serbs|bugaboo|ed|army|loser|metallic|d\.c\.|eve|florida|istanbul|new|oid|québec|post|liverpool|birdie|brother|interesting|farage|cpj|sugar|hamilton|eeevangelical|io|claire|greenwald|medal|samsung|abnormal|halfway|layne|dalston|balham|eating|scalia-alito|prescott|on-?topic|headphone|unofficial|gas|opacity|talent|kelly|ufelon|viggo|young|updosing|hernandez|pulido|offline|closing|india|cargo|democracy|sheen|opt-?out|bleakness|threadbare|peach|extraordinary|god|heresy|tent|evp|impact|crowdsourcing|jay|england|pale|primal|citizen|trump|volley|tv|tights|screen|e-?book|politician|austin|morocco|vienna|astronomy|interactivity|cyril|atm|un-?pc|tablet|moleskine|asparagus|oranges|disability|overload|imperfect|temporary|africa|ipods|ken|physicist|instagram|screenshot|cocoa|tart|aluminum|babe|lifestyle|nytol|queensryche|wire|miquel|viking|abbeydale|sushi|plan|train|stocking|experimentation|welfare|couch|tikkabilla|mice|svg|wrist|gamification|bingo|grits|prosecutor|banker|nn|documentary-?making|rugby|buyer|okay|psychiatrist|kale|lamb|skin|torturing|thought|violence|cookbook|negative|lawyer|sexy|story|distribution|filing|relevancy|riga|yoga|kreuzberg|linkedin|rfc|bike|boris|scientist|assessor|biologist|suri|public|education|prediction|loss|worker|leisure|fats|catholicism|eco-?therapy|freiburg|bombing|marketer|teen|butternut|bluegrass|start-?up|pre-?retirement|correctness|hysteria|vegetarianism|envirocultist|chevron|quantity|polyamory|offender|refn|loud|mode|religion|association|antiracism|classism|cynicism|disparity|feminism|homophobia|multiculturalism|denial|scepticism|transphobia|wealth|center|net|placement|glaad|capital|disco|creating|pear|bar|christie|francis|mata|palin|metaphysical|hypertext|metaphor|fishing|pocket|irrationalism|prudent|dread|lip|mustache|redcurrant|sparrow|stupid|servers|sheffield|demakes|steampunk|ringing|christ|por
tfolio|project|ascension|seed|poor|workweek|lambie|pacheco|down|wrong|davies|safe|highways|rage|officials|connery|damon|archaeology|bowl|business|chef|comedy|cyclocross|director|entrepreneurship|folk-?pop|heist|jockey|knitting|living|phonetic|poultry|presentation|rhetoric|sandal|self-?harm|skiffle|stockbroking|tradition|variety/cd"
cwb-scan-corpus -o freqs4.gz -r "/Volumes/INTENSO/Corpora/ENCOW/cwb-registry" -f 10 ENCOWA lemma+0="/authenticity|autobiography|act|dj|feedback|hairdressing|influencer|killer|material|moderator|molestation|anything|pet|physics|rap|rapper|shanty|i\.t\.|gamer|programmer|production|joss|paul|boredom|constantinople|dc|washington|luiz|theo|heston|robertson|rulers|parkgate|microformat|rtf|pearce|chinese|collectors|weaver|4k|cool|operator|chicago|singapore|savings|crazy|eccentricity|madness|math|topicality|kurdistan|selling|list|networking|vettel|libertarianism|milo|e-?bikes|norway|josh|snyder|marketeer|hd|fdr|optimi[zs]ation|rss|bellevue|bury|hounslow|ridgewood|elite|treasure|touching|serving|juvenility|nfo|summer|weti|chocolate|housewife|self-?loathing|talk|growing|judgment|40|brainy|corny|dumb|helmet|homo|responsible|simple|skinny|statistician|strength|unflappable|gupta|caruso|midi|cross-?dressing|drinking|sewing|avalanche|shenzhen|iggy|windrow|chavs|fit|tighter|orbit|mezzoblue|usenet|socialist|abortion|capitalism|coercion|copyright|sick|awake|big|micro|tall|nme|drinker|obesity|sitting|suvs|email|effect|strategy|lib-?dems|hollywood|hardware|order|afghanistan|nobodie|everybody|something|barista|ortega|kim|north|eu|bosnia|ppc|vandalism|shouting|dystopia|stability|cutting|yates|hard|break|circle|knol|military|dubrovnik|husky|owner|tattooist|change|understanding|inversion|sinofksy|wikipedia|murphy|nesbo|stomping|go|bent|rhino|billionaire|cyberspace|p\=np|weak|desktop|student|charge|membership|kakuro|set|spring|saturday|mini|rationality|shortage|kindness|flu|sandwich|theft|cent(re|er)|smile|tarmo|hide|tail|disk|http|panic|turquioise|browsing|mail|secularism|reporter|writer|ghraib|chief|mooc|sort|recycling|tommie|technocracy|curvy|forties|kid|bottle|sock|datamoshing|hocus-?pocus|bali|hand?kerchief|tlr|alcohol|salt|yesterday|dogs|judging|kiriakou|shia|shitty|magarri|namor|party|maria|islamification|nihilism|affection|agents|legacy|smartcar|communication|pineapple|they|legging|langerado|fallacy|lies|borgia|bitcoin|salmon|barnsley|dakota|sicily|threesome|coldplay|moore|slackware|kitty|beauty|obrist|postgrad|dragon|side|kenya|ut|fictionalism|mermaid|angel|immortal|minotaur|witch|let|biloxi|edtech|jakeways|vertonghen|afghan|certificate|sandro|vhs|percussion|love|leopard|potplayer|whatwg|etc|towel|reporting|consensus|vatican-?gate|lumens|vagueness|datalink|words|apis|zimbabwe|east|mars|engine|manor|tequila|weed|beacon|lottie|westfield|digit|mashable|mongolia|sa|yemen|stewart|murdock|ios|x|beer|sneaker|cook|paedophile|kok|measle|wes|picture|professors|ufc|y|avenger|kinect|lily|coverage|sequestration|myspace|nyt|tkm|sox|decade|tomorrow|healing|parent|pensioner|pusher|levaquin|tuncay|ki-?duk|galloway|post-?apocalyptic|monster|80s|a-?levels|dewdney|aaa|abolitionist|isaac|abstin[ea]nce|hamza|accountancy|acquisition|actual|add|adobe|adriano|aesthetics|african-?americans|aggression|aid|airline|airlines|capone|qaeda|alchemy|alcoholic|aldomania|liddi|alexandria|alien|amazon|dream|amish|amsterdam|steel|analysis|anarchy|anderson|andreessen/bina|coulson|goram|rooney|angst|kournikova|smith|antarctica|anti-?communism|anti-?doping|antisemitism|margarito|aol|newton|application|appple|monkey|51|argentina|argonaut|aristocracy|aristotle|arkansas|arla|armor|devaney|scargill|graduate|tobacco|asbestos|at&t|athens|athletic|atlantis|attila|august|austen|b-?word|babylon|bachelor|bachelors|backlink|backyard|bagel|ling|bank|kopple|barbarian|barefoot|barrel|barrichello|weaving|batman|baucus|bbc|skiing|bear|beatles|beautiful|butthead|
beef|berber|berger|berlin|levin|betamax|beyonce|bicycle|bifocals|bikini|binary|bischoff|bits|bizarre|panthers|blackberry|blackface|blackout|blacks|bling-?bling|blogroll|bmw|race|bob|dole|nichols|fett|bobcat|bollywood|bolshevik|bolton|bono|boobs|woogie|cover|borg|co|bowyer|box|boyfriend|bpd|brahmin|bravery|lines|breaking-?and-?entering|brian|brick|bristol|spears|brooklyn|brussel|buffalo|bullet|burberry|burqa|burton-depp|buying|c\.v\.|calculus|camping|wharf|cancer|cantona|car|footprint|boozer|s|dufay|carp|carpenter|nation|daly|cartel|cash|casillas|castle|castro|cathedral|morlands|caution|sabathia|censorship|ceramic|cerebus|certainty|trailer|champagne|channel|chardonnay|che|chekhov|chili|chivalry|cholesterol|yun-fat|christianity|norris|chuckecheese|churchill|cigarette|darling|citizenry|life|classical|classic|classy|ranieri|writing|cliche|click|stoll|climate|co2|cobol|cocaine|coke|colonialism|colonialist|colour|sans|section|commies|commissar|commodity|communism|communist|compassion|competition|compton|concordepan|concrete|conferencesphere|confidentiality|conformity|conservatories/cd"
cwb-scan-corpus -o freqs5a.gz -r "/Volumes/INTENSO/Corpora/ENCOW/cwb-registry" -f 10 ENCOWA lemma+0="/consistency|constitution|consumerism|consumption|pill|cookery|syrup|corset|rica|sol|counterculture|court|coventry|crackberry|crate|creation|creationism|crisps|crossfire|crossroads?|crossword|croutons|crucifixtion|crusader|ctcss|curling|cv|czechoslovakia|da-?da|dailies|dallas|haren|queen|daniels|devito|mills|knight|darkroom|brent|dawkins|panels|death|decadence|decentralisation|deep|gomes|demo|demographics|demos|martyr|niro|sinor|howell|rodman|depth|stürmer|tan|blowout|desk|despair|shala|diaby|diana|diary|digg|digressives|dillard|discount|disease|diva|dope|dealer|dot-?com|ellis|downtown|drogba|dropbox|drudge|drunk|dslr|dude|dungeon|easier|dulwich|easter|editorial|editor|egalitarianism|jefe|dorado|ela|electricity|electronic|pitch|elevator|elitism|ring|engraving|enjolras|entertainment|novel|equality|equity|er|errorhoff|essential|estates|estramadura|ether|eurogamer|peron|evita|eworld|excellent|exclusivity|extreme|eyelash|f\.|facism|falco|falluja|fallujah|farm|faramir|farmer|fascism|fascist|fashion|fast|fax|ferrari|mignon|steak|filth|finance|fire|fireplace|fireside|grade|first|flat-?earther|flesh|fletcher|flexible|fondue|formal|fourteen|france|londons|dibnah|kruger|freedom|freelance|frontline|froyo|fuel|fundamentalism|funky|funny|reactor|future|gaige|galileo|gallery|gameplay|movie|gangster|garage|gatekeeping|geese|gentiles|bailey|georgia|german|germany|gestapo|ghettoblasters|ghey|gilberto|gillan|giving|gladioli|glamour|glasgow|glastonbury|glitter|godwin|going|sachs|golf|gollum|kiss|reader|gopher|tex|goretex|gospel|gothic|gourmet|governments|casey|govinda|taylor|grammar|grammars|grande|great|greece|grib?benes|gross|guantánamo|gun/cd"
cwb-scan-corpus -o freqs5b.gz -r "/Volumes/INTENSO/Corpora/ENCOW/cwb-registry" -f 10 ENCOWA lemma+0="/gutenbergpress|gutenbergs|mary|hairdresser|handshake|hanley|hargo|hargreaves|stassen|harrods|potter|speech|hawaii|headline|heaven|hebrew|hellenism|helvetica|hemline|heretic|heroin|hi-?fi|hicks|hickton|hieroglyphs|diploma|musical|high|hipster|hitler|hitlers|holiday|holocaust|home|base|homeland|homeworking|hoola-?hoop|hot|hotmail|hotspots|html|humility|hunter-?gatherer|hvr|hybrid|hyypia|holloway|ibm|ibms|icarus|ice-?cream|iceberg|iceland|ics|ie|iluminati|imperialists|inbound|indians|individual|insanity|inside|fasting|iowa|ira|irish|ironic|italian|itunes|ballard|robinson|plastic|jazzercise|jean|jeans|lawrence|jersey|jew|jewellery|reference|carioca|lieberman|mccarthy|bircher|doerr|galt|updike|wayne|bahru|mcdaniels|journalist|juicing|jumanji|juninho|justice|bieber|kaka|karaoke|keane|reeves|kennedy|ketchup|shortcut|sanh|kimye|iii|king|kinsey|kissing|kkk|kleenex|kosovo|kp|krakow|labor|labourer|market|lease|las|disc|laserdisc|last\.fm|afternoon|latin|laughter|laursen|lebanon|leeds|leicester|vinci|leper|leprosy|hemsworth|libedems|libel|liberal|lie|light|lighter|lilac|allen|exchange|lipstick|lisp|literature|lolcat|armstrong|love-?ins|lowbrow|lp|lucas|walliams|lucky|lugaru|m\$|machiavelli|madonna|magic|mainframe|mainstream|majority|makalele|makelele|maldive|utd|consultancy|marabou|maradona|march|marianne|monroe|mario|trail|marmite|maroon|marshmallow|marxism|skiba|mauritius|mayfair|mba|mcdonald|mcgugan|mcmansion|medici|megabyte|megachurch|megapixel|mentorship|menu|mercury|merlin|meta|mexico|cera|foot|jackson|owen|windows|mid|mid-?lister|middleware|midichlorian|sweeney|edition|cd|million|minimalism|miniskirt|mink|minstrelsy|missionary|mississippi|mob|floral|modern|molotov|monarch|moneti[sz]ation|liner|carlo|monty|morality|more|morrissey|moscow|mosque|theresa|mouse|moustache|mph|clutch|grundy|mtv|mullet|multi-?culturalism|store|musketeer|mysql|mythology|n\*\*\*r|naked|nan|napster|nasa|nasser|nation-?states|nationalism|natural|naughty|nazi|nazii?sm|negro|kinnock|marcus|nero|netscape|neutral|guinea|hampshire|york|newfoundland|newspaper|newsstand|gingrich|nice|nick|saban|nicotine|mansell|nigga|nigger|nio|nixon|noah|chomsky|nobel|non-?permanent|norton|nostradamus|hill|cuisine|nr|o'connor-ginsburg|official|oj|old-?school|solskjaer|lozano|omarosa|opening|ophelia|opium|oprah|opt-?in|optimism|opulence|ordinary|organic|boy|orthodoxy|stage|ouija|outcome|outsourcing|somethin'|paganism|pakistani|paleo|palestine|pantomime|paparazzi|paperback|paramedic|paris|pascal|passivity|payphone|pda|peanut|pearl|hilton|perfect|perimeter|permanent|gulf/cd"
cwb-scan-corpus -o freqs6.gz -r "/Volumes/INTENSO/Corpora/ENCOW/cwb-registry" -f 10 ENCOWA lemma+0="/buckle|essay|pete|philosopher|album|photograph|lesson|pilates|floyd|pique|pirates|pitsmoor|pizza|plane|planking|planning|plantation|platinum|playing|playschool|shears|png|polenta|police|polo|pool|pop|pope|popstar|pork|rind|pornography|positive|postman|potato|power|suit|powerpoint|prague|prayer|prc|pre-?interview|pre-?school|prenzlauer-?berg|release|press-?bof|prezza|priesthood|priests|charles|private|line|productivity|profit|profligacy|prohibition|proletariat|property|proteins|protestantism|prozac|prussia|psg|publisher|pubs|contestant|pullover|punk|bands|purgatory|puritanism|puritan|put|pwl|quality|queer|tarantino|landlord|rack|rad|radicalism|radical|railway|push|random|rap-?battle|raspberry|raw|reagan|real|reality|realtree|reason|reckless|dress|kite|butter|remake|renfair|requiem|goalkeeper|r[eé]sum[eé][eé]?|revelation|krispies|rice|rich|wentworth|partridge|mute|up|ferdinand|risky|river|baron|hood|jerome|rock'n'roll|dangerfields|hammerstein|harris|stone|rolodex|circus|roman|rome|terry|ronaldo|parks|roswell|rotherham|oyal|royalty|rtfa|ruby|crowe|russian|franklin|s\.e\.o\.|sabbath|sacd|sad|saint|samizdat|francisco|pedro|sand|sane|sanskrit|satire|arabia|scam|scandinavia|schumi|scientology|doo|scooter|scotland|hinze|walker|script|kiddies|sd|sdr|seattle|secessionist|millionaire|sectarianism|seeing|segway|selfishness|senility|serfdom|serf|seti|sexiness|shaiks|sharia|shatner|shepherd|chino|shylock|sidewinder|silicon|sinatra|sincerity|sinner|siren|skinhead|skydiving|slashdot|ship|leia|slavery|slave|smalltalk|smash|smoking|snob|sodom|soil|soldier|somalia|somebody|sommelier|somoza|sondheim|sony|lauren|south|soviet|spam|spandex|spanking|vampire|speaker|speculation|spending|spielberg|spin|sporty|spouses|squidoo|tropez|staffies|stainless|stalin|stalingrad|stasi|quo|stereotype|martin|larsson|stop|storefront|straight|strange|donkey|thug|studio|cup|sub-?prime|fees|subscription|success|sudoku|lunch|sunray|super-?size|superheroes|superman|superstition|surrealism|boyle|suv|sweat-?shop|knife|swoosh|symbian|nemeth|t-?shirt|tables|taffeta|tcp|teal|don|telephone|temple|ian|pratchett|offensive|texan|textbook|theatre|thelma|theocracy|thin|speak|thirties|is|jefferson|tickle|tie|tiltshift|time-?travel|tioman|tits|tlc|today|toddlers|tolerance|drake|cooper|toner|tonto|stark|tories|torture|tory|totalitarianism|toughness|tower|toxic|publications|trans-?am|triangle|tribesmen|trophy|trousers|bonnaroo|tudors|mania|tuna|tuscany|tuxedo|twosome|u2|ubuntu|uffie|ugliness|ulay|uncool|undergrad|benefit|unicorn|usc|interface|ussr|vaccine|dentata|vanilla|vanity|var|vegan|1\.0|versailles|vertical|vidic|vietcong|vietnam|foster|vinyl|violin|virginity|visual|vlc|w3c|warlord|warm|warsaw|washing|wasp|cooler|watergate|watts|weakness|weapon|weimar|wheels|whirlow|whiskey|lund|whistler|whiteley|whore|widget|rogers|hearst|window|98|winona|winter|hunt|whistle|jing|woodstock|woodward|woody|word|wwe|x-?men|xbox|xtel|y2k|yamaha|yankee|page|yellowpage|youngster|youth|yule|z-?pak|zealot|zen|zenden|yimou|zippy/cd"
  2b. The frequency list was used to generate the queries for the actual vector-space analysis. For high-frequency items, the first 10,000 hits in ENCOWA were used. For less frequent items, hits from ENCOWB, ENCOWC, ENCOWD etc. were also taken into account. In all cases, the number of hits was capped at 10,000. Items that occur fewer than 10 times in ENCOWA were not taken into account at all, as we assumed that even with the other corpus batches we would not be able to find a sufficient number of hits. The results of our search can be found in the CSV sheet “collocates”.
# collocates
coll <- read_csv("../data/X_is_the_new_Y_distsem/collocates.csv")
## Rows: 8598 Columns: 1415
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr    (1): collocate
## dbl (1414): abnormal, abnormality, abortion, abstinence, abuse, academy, acc...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

We use the collocates to compute Positive Pointwise Mutual Information (PPMI), which in turn allows us to calculate cosine distances following Levshina (2015).
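
As a minimal illustration of the PPMI computation used below (toy counts, not real data):

# toy co-occurrence matrix: 2 target words x 3 collocates (made-up counts)
toy <- matrix(c(10, 0, 5,
                 2, 8, 5), nrow = 2, byrow = TRUE,
              dimnames = list(c("word1", "word2"), c("collA", "collB", "collC")))
toy_exp  <- chisq.test(toy)$expected         # expected frequencies under independence
toy_pmi  <- log2(toy / toy_exp)              # pointwise mutual information
toy_ppmi <- ifelse(toy_pmi < 0, 0, toy_pmi)  # negative values clipped to 0 -> PPMI
toy_ppmi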

# "collocate" column to rownames:
coll <- as.data.frame(coll)
rownames(coll) <- coll$collocate
coll <- coll[ , -1] # remove first column
coll <- t(coll) # switch rows and columns
# get PPMI ----
# get expected frequencies
coll <- as.matrix(coll)
coll.exp <- chisq.test(coll)$expected
## Warning in chisq.test(coll): Chi-squared approximation may be incorrect
coll.PMI <- log2(coll / coll.exp)
coll.PPMI <- ifelse(coll.PMI < 0, 0, coll.PMI)
# get cosine similarity ---------------------------------------------------
# cosine similarity function
# (adopted from Levshina's [2015] Rling package)
nr <- nrow(coll)
m <- matrix(NA, nr, nr)
colnames(m) <- rownames(m) <- rownames(coll)
# for(i in 1:nr) {
#   for(j in 1:nr) {
#     cos <- crossprod(coll.PPMI[i, ], coll.PPMI[j, ])/sqrt(crossprod(coll.PPMI[i,
#     ]) * crossprod(coll.PPMI[j, ]))
#     m[i,j] <- cos
#     m[j,i] <- cos
#   }
#   
#   print(i)
# }
# export
# saveRDS(m, "m.Rds")
m <- readRDS("../data/X_is_the_new_Y_distsem/m.Rds")
# get distances
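# (each similarity is scaled by the largest value below 1 and subtracted from 1,
# so the most similar pair of distinct items receives a distance of 0)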
m2 <- 1 - (m / max(m[m<1]))
# backup copy
m2_matrix <- m2
# as.dist
m2 <- as.dist(m2)
# as matrix
m2_matrix <- as.matrix(m2, varnames = c("row", "col"))

Multidimensional Scaling

The resulting matrix is now used as input for multidimensional scaling:

# mds
m3 <- cmdscale(m2)
m3 <- rownames_to_column(as.data.frame(m3))
colnames(m3) <- c("Lemma", "dim1", "dim2")

Partitioning around medoids

For bottom-up identification of (potential) semantic groups, we use Partitioning Around Medoids (PAM).

# Clusters
m2_clust <- cluster::pam(m2, 14)
m2_cluster <- m2_clust$clustering %>% as.data.frame()
m2_cluster <- rownames_to_column(m2_cluster)
colnames(m2_cluster) <- c("Lemma", "Cluster")
# join dataframes
m3 <- left_join(m3, m2_cluster)
## Joining, by = "Lemma"

Visualization

The MDS and clustering information can be used for visualizing the results. In addition, we add frequency information to the plot:

# add frequency information
m3 <- left_join(m3, l, by = c("Lemma" = "word"))
m3$freq_x <- sapply(1:nrow(m3), function(i) length(which(d$lemma_head_x == m3$lemma[i])))
m3$freq_y <- sapply(1:nrow(m3), function(i) length(which(d$lemma_head_y == m3$lemma[i])))
m3$freq <- m3$freq_x + m3$freq_y
# add relative frequency with which each item occurs in x or y slot
m3$rel_x <- m3$freq_x / (m3$freq_x + m3$freq_y)
m3$rel_y <- m3$freq_y / (m3$freq_x + m3$freq_y)
m3 <- replace_na(m3, list(rel_x = 0, rel_y = 0))
# plot (only Freq >= 10 to keep plot readable)
# set a seed so that the location of the datapoints
# (arranged by ggrepel package) will remain the same
set.seed(1985)
#plot
(p1 <- ggplot(filter(m3, freq >= 5), aes(x = dim1, y = dim2, label = Lemma, col = factor(Cluster))) +
  geom_text_repel(aes(size = log1p(freq)*2), max.overlaps = 15) + 
    scale_color_discrete(terrain.colors(14)) +
  guides(col = "none", size = "none") + theme_bw() + theme(axis.text = element_text(size = 18)) +
  theme(axis.title = element_text(size = 18)) +
  theme(strip.text = element_text(size = 18)) +
  theme(legend.text = element_text(size = 18)) +
  theme(legend.title = element_text(size = 18, face = "bold")) +
  theme(text = element_text(size = 10)) )

# ggsave("distsem01.png", width = 8, height = 8)

Cosine distance between x and y slot

Another interesting metric is the cosine distance between the x and the y slot of the individual instances of the construction.

# add Cosine distance to original dataframe
d$cosine_distance <- NA
for(i in 1:nrow(d)) {
  if(d$lemma_head_x[i] %in% colnames(m2_matrix) &&
     d$lemma_head_y[i] %in% rownames(m2_matrix)) {
    d$cosine_distance[i] <- m2_matrix[which(colnames(m2_matrix) == d$lemma_head_x[i]),
                                      which(rownames(m2_matrix) == d$lemma_head_y[i])]
  }
}
# add column with x and y
d$lemma_heads <- character(nrow(d))
d$lemma_heads <- paste(d$lemma_head_x, d$lemma_head_y, sep = "/")
d %>% arrange(desc(cosine_distance)) %>% 
  select(lemma_head_x, lemma_head_y, cosine_distance) %>% na.omit %>% 
  unique %>% datatable() %>% formatSignif(columns = "cosine_distance", digits=3)

A different visualization option: instead of showing clusters by color, we show how often each datapoint occurs in the x or the y slot.

# add relative frequency in x and y slot
m3$rel_x <- m3$freq_x / (m3$freq_x + m3$freq_y)
m3$rel_y <- m3$freq_y / (m3$freq_x + m3$freq_y)
m3 <- replace_na(m3, list(rel_x = 0, rel_y = 0))
# set a seed so that the location of the datapoints
# (arranged by ggrepel package) will remain the same
set.seed(1985)
#plot
(p2 <- ggplot(filter(m3, freq >= 5), aes(x = dim1, y = dim2, label = Lemma, col = rel_x)) +
  geom_text_repel(aes(size = log1p(freq)*2), max.overlaps = 15) + 
    scale_color_continuous(low = "blue", high = "red") +
  guides(col = "none", size = "none") + theme_bw() + theme(axis.text = element_text(size = 18)) +
  theme(axis.title = element_text(size = 18)) +
  theme(strip.text = element_text(size = 18)) +
  theme(legend.text = element_text(size = 18)) +
  theme(legend.title = element_text(size = 18, face = "bold")) +
  theme(text = element_text(size = 10)) )

# ggsave("distsem02.png", width = 8, height = 8)

Visualizing semantic distance between X and Y

distances <- d %>% arrange(desc(cosine_distance)) %>% 
  select(lemma_head_x, lemma_head_y, cosine_distance) %>% na.omit %>% 
  unique
distances %>% ggplot(aes(x = cosine_distance)) + geom_histogram(binwidth = 0.03, col = "black", fill = "grey50") + theme_classic() + ylab("Count") + xlab("Cosine distance")

# ggsave("cosine_distance_hist.png")

Selected items

find_items <- function(x, y) {
  return(distances[which(distances$lemma_head_x == x & distances$lemma_head_y == y),])
}
rbind(find_items("anxiety", "depression"),
      find_items("female", "male"),
      find_items("democrat", "republican"),
      find_items("abnormality", "disease"),
      find_items("alpha", "beta"),
      find_items("audio", "video"),
      find_items("pear", "raspberry"),
      find_items("sushi", "pizza"),
      find_items("small", "large"),
      find_items("environmentalist", "socialist"),
      find_items("sugar", "nicotine"),
      find_items("computer", "radio"),
      find_items("publishing", "literacy"),
      find_items("paper", "confidentiality"),
      find_items("mean", "green"),
      find_items("sustainable", "black"),
      find_items("ethics", "green"),
      find_items("funds", "black")
      ) %>% mutate(lemmas = factor(paste0(lemma_head_x, " - ", lemma_head_y), levels = paste0(lemma_head_x, " - ", lemma_head_y))) %>%
  ggplot(aes(x = cosine_distance, y = lemmas)) + geom_col(fill = "black") + theme_bw() + ylab("Lemmas") + xlab("Cosine distance")

# ggsave("xnewy_examples_distance.png")

COCA

For COCA, we used the following CQP query:

[pos="N.*|J.*"] [word="is|are"] "the" "new" [pos="N.*|J.*"]
# read data ---------------------------------------------------------------
d <- read_xlsx("../data/COCA_X_is_are_the_new_Y.xlsx")
# add decade
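# (the perl regex replaces the fourth character of the year, i.e. its last digit,
# with 0, so that e.g. 1997 becomes 1990)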
d$Decade <- gsub("(?<=...).", "0", d$Year, perl = T)
# remove false hits
d <- filter(d, keep == "y")
# quick overview ----------------------------------------------------------
# types, tokens, hapaxes
tibble(
  tokens = nrow(d),
types_x  = d$lemma_x %>% unique %>% length,
types_y  = d$Lemma_y %>% unique %>% length,
types_all = paste0(d$lemma_x, "/", d$Lemma_y) %>% unique %>% length,
hapaxes_x = d$lemma_x %>% table %>% as_tibble() %>% filter(n == 1) %>% nrow,
hapaxes_y = d$Lemma_y %>% table %>% as_tibble() %>% filter(n == 1) %>% nrow,
hapaxes_all = paste0(d$lemma_x, "/", d$Lemma_y) %>% table %>% as_tibble() %>% filter(n == 1) %>% nrow
) %>% kbl()
tokens types_x types_y types_all hapaxes_x hapaxes_y hapaxes_all
82 67 52 71 60 47 64
# quick visualization -----------------------------------------------------
qbarplot(filter(d, black == "n"), Decade, concept_x, pos_x, wrap100 = T) +
  scale_fill_grey(start = .8, end = .3)
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.

qbarplot(filter(d, black == "n"), Decade, concept_x) +
  scale_fill_grey(start = .8, end = .3)
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.

qbarplot(filter(d, black == "n"), Decade, pos_x)

qbarplot(d, Decade, black)

# relative frequency ------------------------------------------------------
# read total frequencies
coca <- read_xlsx("../data/COCA2017_total_frequencies.xlsx")
# bin by decade
coca$Decade <- as.integer(gsub("(?<=...).", "0", as.character(coca$YEAR), perl = T))
coca_dec <- coca %>% group_by(Decade) %>% summarise(
  Freq = sum(TOTAL)
)
# frequency of X is the new Y per decade
d_tbl <- table(d$Decade) %>% as.data.frame(stringsAsFactors = FALSE)
colnames(d_tbl) <- c("Decade", "Freq_x_is_the_new_y")
d_tbl$Decade <- as.integer(d_tbl$Decade)
d_tbl <- left_join(d_tbl, coca_dec, by = "Decade")
d_tbl$pmw <- (d_tbl$Freq_x_is_the_new_y / d_tbl$Freq) * 1e06
plot(d_tbl$Decade, d_tbl$pmw, ylim = c(0,0.25), type = "b")

References

  • Flach, Susanne. 2017. collostructions: An R Implementation for the Family of Collostructional Methods. www.bit.ly/sflach.

  • Gries, Stefan Th. 2019. 15 years of collostructions: Some long overdue additions/corrections (to/of actually all sorts of corpus-linguistics measures). International Journal of Corpus Linguistics 24(3). 385–412.

  • Schäfer, Roland. 2015. Processing and querying large corpora with the COW14 architecture. In Piotr Bański, Hanno Biber, Evelyn Breiteneder, Marc Kupietz, Harald Lüngen & Andreas Witt (eds.), Challenges in the Management of Large Corpora (CMLC-3), 28–34.

  • Schäfer, Roland & Felix Bildhauer. 2012. Building Large Corpora from the Web Using a New Efficient Tool Chain. In Nicoletta Calzolari, Khalid Choukri, Terry Declerck, Mehmet Uğur Doğan, Bente Maegaard, Joseph Mariani, Asuncion Moreno, Jan Odijk & Stelios Piperidis (eds.), Proceedings of LREC 2012, 486–493.

  • Levshina, Natalia. 2015. How to do linguistics with R. Data exploration and statistical analysis. Amsterdam, Philadelphia: John Benjamins.