---
title: "Numeral Classifiers in Yucatec Maya: Microvariation and syntactic change"
subtitle: "Supplementary Material"
author: "Barbara Blaha and Stavros Skopeteas"
date: "September 28, 2024"
output:
  pdf_document: 
    highlight: tango
    toc: true
    toc_depth: 2
    number_sections: true
    df_print: kable
    fig_width: 4
    fig_height: 3
    keep_tex: true
  html_document: default
---

# Global

```{r include=FALSE}  
knitr::opts_chunk$set(message=FALSE,warning=FALSE,attr.source='.numberLines') 
knitr::opts_chunk$set(
  fig.width = 12,    # Width of the figure in inches
  fig.height = 9,   # Height of the figure in inches
  dpi = 700         # High resolution
)
```

## Libraries
data processing
```{r, attr.source='.numberLines'}
  library(readxl)
  library(writexl)
  library(reshape2)
  library(scales)
  library(dplyr)
  library(stringr)
  library(showtext)#fonts
  library(kableExtra)
  library(broom)
```

graphs and maps
```{r}
  library(ggplot2)
  library(ggmap)
  library(ggspatial)
```
statistics
```{r}
  library(mgcv)
  library(mgcViz)
  library(itsadug)
```

## Settings
```{r}
#setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
theme_set(theme_bw())
#setting the size of annotations
annosize = 6
#downloads google font for graphs
font_add_google("Martel Sans", family = "special")
showtext_auto()
```
## Loading data
```{r}
load(file='YUC-CL.rda')

```

# Base map

This function creates a base map of Yucatán, based on the GADM data (https://gadm.org)

```{r, out.width="80%", warning=FALSE, results= 'hide', fig.cap="Figure: Base map"} 

  yuc.map.bare <- ggplot(data = gadm) +
    geom_sf(fill = "#faf9f8",color = "grey30",size=.1) + #FloralWhite
    theme(text = element_text(family = "special"),
          axis.text = element_text(size = 18, color="grey40"),
          axis.title = element_text(size = 22))+
    coord_sf(xlim = c(-91, -86.7), ylim = c(18.1, 21.8), expand = T)+
    scale_x_continuous(labels = abs,breaks = seq(-180,180,by=1),
                       minor_breaks = seq(-180,0,by=.1))+#,breaks = seq(-91,-87,1),
    scale_y_continuous(labels = abs,breaks = seq(0, 100, by=1),
                       minor_breaks = seq(0,100,by=0.2))+
    annotation_north_arrow(location = "br", which_north = "true", 
                           height = unit(0.7, "cm"),
                           width = unit(0.7, "cm"),
                           pad_x = unit(0.02, "in"), 
                           pad_y = unit(0.1, "in"),
                           style = north_arrow_fancy_orienteering(line_width = 0.4, 
                                                           line_col = "black",
                                                           fill = c("white", "grey30"),
                                                           text_col = "black",
                                                           text_size = 12))+
  annotation_scale(location = "br", 
                   width_hint = 0.12,
                   line_width = 0.2,
                   bar_cols = c("grey30", "white"),
                   pad_y = unit(0.1, "cm"),
                   pad_x = unit(0.1, "cm"),
                   text_cex = 1,
                   height = unit(0.1, "cm"))+
    labs(x="Longitude (°W)",y="Latitude (°N)")

  yuc.map <- yuc.map.bare+
    annotate("text",x=-90.4,y=20.05, label = "Camino\nReal",
             size = annosize, lineheight = 0.3,family="special")+
    annotate("text",x=-88,y=21.1, label = "Northeast",
             size = annosize, family="special")+
    annotate("text",x=-89.9,y=20.8, label = "Nortwest",
             size = annosize, family="special")+
    annotate("text",x=-89.9,y=19.2, label = "Los Ch'e'enes",
             size = annosize, family="special")+
    annotate("text",x=-89.1,y=20, label = "Southern\nYucatán",
             lineheight = 0.3,size = annosize)+
    annotate("text",x=-88.4,y=19.6, label = "Central\nQR",
             lineheight = 0.3,size = annosize)+
    annotate("text",x=-88.3,y=18.2, label = "BELIZE",size = annosize)+
    annotate("text",x=-88.6,y=21.3, label = "YUCATAN",size = annosize)+
    annotate("text",x=-88.4,y=19, label = "QUINTANA ROO",size = annosize)+
    annotate("text",x=-90,y=18.8, label = "CAMPECHE",size = annosize)
  yuc.map
```

# Functions

## Map graphs

The following function creates plots of gradient features on the base map. The maps contain 5 colors on a scale. The use of colors depends on the distribution of the data points in the dataset (see median-based distribution in p1...p5)

```{r}

plot.map.gradient <- function(data)
  { 
      caption <- substitute(data)
      data$Value.p <- rescale(data$Value.p)
      p5 <- 1
      p4 <- 1-(1-median(data$Value.p))/2
      p3 <- median(data$Value.p)
      p2 <- median(data$Value.p)/2
      p1 <- 0
      sum.plot <- yuc.map + 
        theme(text = element_text(family = "special",size=14))+
              geom_point(data = data, 
                 aes(x = Long, y = Lat,fill=Value.p),
                 size=3,
                 alpha=.9,
                 shape=21)+ 
        theme(legend.key.size = unit(0.1, 'lines'),
              legend.text = element_text(size=18,vjust= 1,lineheight = 0),
              legend.title = element_text(lineheight = 0.4, size=19,angle=0,hjust= 1),
              legend.position="top",
              legend.justification="right",
              legend.box.background = element_rect(color="grey30", size=.1)
              )+
        scale_fill_gradientn(values=c(p5, p4, p3, p2, p1), 
                       colours=c("#663300", "#996633", "#CCFFCC", "#99FF33", "#66CC00"),
                       guide = guide_colourbar(direction = "horizontal",
                                                   frame.colour="grey",
                                                   barheight = .7,
                                                   barwidth = 4,
                                                   ticks = T,
                                                   title.position = "left"),
                           breaks=c(0,1),
                           name="% general classifier,\n rescaled to [0,1]"
                          )

    
    print(sum.plot)
}
```

## Statistics

### Plotting GAM coefficients

```{r}
      my.cex = 2
      my.cex.lab = 3

plot.gam <- function(model) {
    caption = substitute(model)
      vis.plot <- vis.gam(model,
                        plot.type="contour", 
                        color="terrain",
                        too.far=0.15, 
                        view=c("Long","Lat"),
                        main="",
                        cex.axis=2,
                        cex.lab=my.cex.lab
                        )
    text(-89.8,21, label = "Nortwest",cex=my.cex)
    text(-90.3,y=20.05, labels = "Camino Real", xpd=TRUE,cex=my.cex)
    text(-88,21.2, labels = "Northeast", xpd=TRUE,cex=my.cex)
    text(-89.8,19.2, labels = "Los Ch'e'enes", xpd=TRUE,cex=my.cex)
    text(x=-89.1,y=20, labels = "Southern Yucatán", xpd=TRUE,cex=my.cex)
    text(x=-88.4,y=19.6, labels = "Central QR", xpd=TRUE,cex=my.cex)
    text(-88.3,18.5, labels = "Belize", xpd=TRUE,cex=my.cex)

    print(vis.plot)
}

```

Plotting GAM co-efficients with interactions of the smooth term

```{r}
plot.gam.2 <- function(model,condition="") {
    caption = substitute(model)
    caption.condition = substitute(condition)
        vis.plot <- vis.gam(model,
                            plot.type="contour", 
                            color="terrain",
                            cond=list(Subtype=condition),
                            too.far=0.15, 
                            view=c("Long","Lat"),
                            main="",
                            cex.axis=3, 
                            cex.lab=4
                            )
                      text(-89.8,21, label = "Nortwest",cex=my.cex)
                      text(-90.3,y=20.05, labels = "Camino Real", xpd=TRUE,cex=my.cex)
                      text(-88,21.2, labels = "Northeast", xpd=TRUE,cex=my.cex)
                      text(-89.8,19.2, labels = "Los Ch'e'enes", xpd=TRUE,cex=my.cex)
                      text(x=-89.1,y=20, labels = "Southern Yucatán", xpd=TRUE,cex=my.cex)
                      text(x=-88.4,y=19.6, labels = "Central QR", xpd=TRUE,cex=my.cex)
                      text(-88.3,18.5, labels = "Belize", xpd=TRUE,cex=my.cex)
    print(vis.plot)
}
```

### Density graph

```{r}
plot.density <- function(data,xlab = "n responses by speaker",bin = 2,xlim1=0,xlim2)
{ 
  caption <- substitute(data)
  density.plot <- ggplot(data,aes(Freq)) + 
    geom_histogram(aes(y=..density..),binwidth=bin, colour="black", fill="grey")+
    xlim(xlim1,xlim2)+
    geom_density(alpha=.1, fill="blue")+
    geom_vline(aes(xintercept=mean(Freq)),
               color="darkred", linetype="dashed", size=1)+
    labs(x=paste0(xlab, " (mean = ",
                  round(mean(data$Freq)),")"))+
    theme(text = element_text(size = 30, family = "special"))
  
  print(density.plot)
}


```


### Scatter plots

```{r}
plot.scatter <- function(data)
  { 
  myplot <- ggplot(result.type.2, aes(x=sum, y=percent,fill=Subtype)) +
    geom_smooth(method=lm, aes(color=Subtype,fill=Subtype),
                level=.50,linetype="dotted",se=F)+
    geom_jitter(aes(fill=Subtype),size=3, shape=21, width=0.5,height=0.5,alpha=0.7)+
    scale_x_continuous(trans = 'log2')+
    scale_colour_manual(values=c("grey","black","lightblue"),name = "Measure Type")+
    scale_fill_manual(values=c("grey","black","lightblue"),name = "Measure Type")+
    ylab("% multiple classifiers")+
    xlab("Frequency per classifier (log-scale)")+
    theme(axis.text = element_text(lineheight = 0.5, size=20),
          axis.title = element_text(lineheight = 0.5, size=24))+
    theme(legend.text = element_text(lineheight = 0,size=20),
        legend.title = element_text(size=24))+
    theme(legend.position="right",
          legend.box.background = element_rect(colour = "black"))
    print(myplot)

  }
```

# Data processing

For the following analysis, all relevant data was merged in the dataframe "dta":

## Labels

### Prompts

+ **Question_ID**: question identifier
+ **Question**: Form of the question
+ **Q_Label**: label of the question (including Question_ID)
+ **Translation**: translation of the question

### Prompt Groups

- **Type**: division of prompts in: sort | measure
- **TypePS**: division of prompts in: sort | part | portion & sum
- **TypePP**: division of prompts in: sorts | part & portion | portion
- **Subtype**: division of prompts in: sorts | part | portion | sum


### Speakers

+ **Speaker_ID**: speaker identifier
+ **Birthyear**: year of speaker's birth
+ **Gender**: M=male, F=female
+ **Spanish_odds**: Spanish:Maya ratio in speaker's data (in AYM)
+ **Long.jitter**: latitude of Location with jitter for each speaker (to avoid overplotting speakers of the same location)
+ **Lat.jitter**: latitude of Location with jitter for each speaker (to avoid overplotting speakers of the same location)

### Locations

+ **Location**: speaker's place of living
+ **Lat**: latitude of Location
+ **Long**: longitude of Location
+ **Pop.size**: population size of Location
+ **P.yuc**: p of indigenous speakers in Location

### Responses

+ **Form**: transcribed response
+ **Value**: classifier
+ **Variant**: variant. Annotations of the numeric expression: 
  - **CL.Gen**: general classifier p'éel;
  - **CL.Spec**: specific classifier, e.g., _túul_ 'CL.ANIM', _kúul_ 'CL.PLANT', _ts'íit_ 'CL.LONG', _maataj_ 'stem' (measure noun from Spanish), _che'_ 'tree', _xéek_ 'CL.FOOT_OF_TREE'; 
  - **CL.Gen CL.Spec**: general classifier followed by specific classifiers; 
  - **NV**: non-valid; 
  - **NV_noData**: no data available; 
  - **NV_noNoun**: no noun available; 
  - **NV_noNum**: no numeric expression, e.g., un hombre rendered as _e máako'_ ; intended numeric expressions translated with the vague quantifier _ya'ab_ 'much/many';
  - **NV_noClass**: no classifier available;
  - **NV_Spanish**: Spanish syntax; 
  - The same lexical item may occur as CL.Spec and as N. It is coded as CL.Spec if it appears in a classifier construction, i.e., it is accompanied by an N without bearing possessive morphology, e.g., _jum p'éel maata wayúum_ (one CL.UNIT stem huaya) 'one stem huaya'; the same lexical item is the head N of an NP if it bears possessive morphology, e.g., _jum p'éel u maata-il wayúum_ (one CL.UNIT A.3 stem-REL huaya) 'one stem of huaya'. 
  - In case of multiple responses: the annotation corresponds to the last valid answer.

+ **Comment**: explanations for Non-Valid data
+ **Structure**: structural representation [Annotation of the structure of the numeral expression at issue: 
  - **A**: set A person marker; 
  - **Adj**: Adjective (annotated in cases that it needs to be distinguished from classifiers), e.g., Q167 _um p'ée wolis sakan_ (one CL.UNIT round dough); also deverbal adjectives in _-bil_ and _-Vkbal_, which are not classifiers 
  - **Adj.Size**: size modifier (e.g., chan): only these modifiers are annotated, to test how often they intervene between numerals and classifiers; 
  - **CL.Gen**: general classifier _p'éel_ (CL.UNIT); 
  - **CL.Spec**: specific classifier (sortal/mensural) or measure noun; 
  - **D**: Determiner; 
  - **N**: noun; 
  - **Num**: numeral; 
  - **(Num)**: phonologically zero numeral (only applies to 'one'). 
  - **Rel**: relationalizer _-il/-ij/-i_ (evidence for a possessive construction);
  - In case of multiple responses: the annotation corresponds to the last valid answer; <Prep>: preposition; 

+ **Origin**: origin of the classifier (Maya, Spanish) [Origin of classifier: this annotation refers to the leftmost specific classifier: 
  - **Spanish**: Spanish origin, e.g., _maataj_ 'stem', _fiilaj_ 'row'; 
  - **Maya**: Maya origin, e.g., _p'éel_ 'CL.Unit', _túul_ 'CL.Anim', 
  - **NV**: not applicable if no specific classifier appears.

## Data


```{r}
kable(head(dta)) %>%
  kable_styling("striped", full_width = F) %>%
  scroll_box(width = "100%", height = "400px")
```  

## Sample

```{r}
speakers.df <- unique(select(dta,Speaker_ID,Birthyear,Gender))
speakers.n <- nrow(speakers.df)

#locations
locations.n <- length(unique(dta$Location))

#age
speakers.sum <- summary(speakers.df$Birthyear)
speakers.gender.sum <- summary(as.factor(speakers.df$Gender))

#summary
speakers.summary <- data.frame(info=c("total speakers",
                                      "female speakers",
                                      "locations",
                                      "birthyear: min",
                                      "birthyear: max",
                                      "birthyear: mean",
                                      "birthyear: median"),
           n=round(c(speakers.n,speakers.gender.sum[2],locations.n,
                     speakers.sum[1],speakers.sum[6],speakers.sum[4],speakers.sum[3]),0))

  kable(speakers.summary) %>%
  kable_styling("striped", full_width = F)

```

## Responses

```{r}
questions.n <- length(unique(dta$Question_ID))
speakers.n <- length(unique(dta$Speaker_ID))
permutations.n <- speakers.n * questions.n
responses.n <- nrow(dta)
responses.p <- responses.n*100/permutations.n

#table
responses.summary <- data.frame(info=c("n questions","n speakers","n questions*speakers",
                                       "n responses","% responses (out of n permutations)"),
           n=round(c(questions.n,speakers.n,permutations.n,responses.n,responses.p),0))

  kable(responses.summary) %>%
  kable_styling("striped", full_width = F)

```

### Non-valid data

```{r}
nv <- data.frame(xtabs(~Variant+Comment,dta))

  kable(nv) %>%
  kable_styling("striped", full_width = F)
  
```
```{r}
nv.n <-  sum(nv$Freq)
nv.content.n <-  sum(nv[grepl("Content at issue", nv$Comment),]$Freq)
nv.construction.n <-  sum(nv[grepl("Construction at issue", nv$Comment),]$Freq)

nv.summary <- data.frame(info=c("n non-valid",
                                "n non-valid: content",
                                "n non-valid: construction"),
           n=round(c(nv.n,nv.content.n,nv.construction.n),0))

  kable(nv.summary) %>%
  kable_styling("striped", full_width = F)

```

excluding non valid data; dataset contains:
```{r}
dta.v1 <- droplevels(subset(dta,Variant != "NV"))
  nrow(dta.v1)
```

responses with Spanish classifiers
```{r}
#spanish  
dta.spa <- droplevels(subset(dta.v1,Origin == "Spanish"))
  nrow(dta.spa)
```

excluding responses with Spanish classifiers
```{r}
dta.v2 <- droplevels(subset(dta.v1,Origin != "Spanish"))
```
  
excluding speakers who do not have at least one valid token for sortal and one valid token for mensural classifiers
```{r}
  speakers.n <- length(unique(dta.v2$Speaker_ID))
  v.speaker <- data.frame(xtabs(~Type+Speaker_ID,dta.v2))
  v.speaker.cast <- dcast(v.speaker, ... ~ Type,value.var = "Freq")
  nv.speakers <- v.speaker.cast[v.speaker.cast$sort == 0 | 
                                  v.speaker.cast$measure == 0, ]$Speaker_ID

dta.v <- droplevels(subset(dta.v2,!(Speaker_ID %in% nv.speakers)))
```

### Valid data
n and % valid
```{r}
  valid.n <- nrow(dta.v)
  valid.prop <-   100*valid.n/responses.n
  valid.n

```

n valid responses per question
```{r,fig.fullwidth=T,warning=FALSE, fig.cap="Figure: n valid per question"}  
  questions.valid <- data.frame(xtabs(~Question_ID,dta.v))
  summary(questions.valid$Freq)
```


```{r, out.width="70%", warning=FALSE, results= 'hide', fig.cap="Figure: Histogramm per questions"} 
  plot.density(questions.valid, xlab = "n responses by question",bin = 10,xlim2=180)
```

n valid responses per speaker
```{r}  
  speakers.valid <- data.frame(xtabs(~Speaker_ID,dta.v))
  summary(speakers.valid$Freq)
```

```{r, out.width="70%", warning=FALSE, results= 'hide', fig.cap="Figure: Histogramm per speakers"} 
  plot.density(speakers.valid,xlim2=25)
```

### classes of prompts
Prompt labels are listed for Appendix I
```{r}
kable(unique(dta.v[dta.v$Subtype =="sort",]$Question_Label)) %>%
  kable_styling("striped", full_width = F)

kable(unique(dta.v[dta.v$Subtype =="portion",]$Question_Label)) %>%
  kable_styling("striped", full_width = F)

kable(unique(dta.v[dta.v$Subtype =="part",]$Question_Label)) %>%
  kable_styling("striped", full_width = F)

kable(unique(dta.v[dta.v$Subtype =="sum",]$Question_Label)) %>%
  kable_styling("striped", full_width = F)

```
  
# Descriptive stats

Creating two subsets of the data -- for sortal and mensural classifiers:
```{r}
  dta.v.sort <- dta.v[dta.v$Type =="sort",]
  dta.v.meas <- dta.v[dta.v$Type =="measure",]

```

## Sortal classifiers

### Frequencies

reporting frequencies of sortal classifiers
```{r}
  sortal_table <- dta.v.sort %>%
    group_by(Variant) %>%
    summarise(frequency = n()) %>%
    mutate(percentage = round(frequency*100 / sum(frequency),1))

kable(sortal_table) %>%
  kable_styling("striped", full_width = F)
    
```

excluding multiple classifiers from this analysis

```{r}
  dta.v.sort <- droplevels(subset(dta.v.sort, Variant != "CL.Gen CL.Spec"))
  dta.v.sort$Variant <- factor(dta.v.sort$Variant, levels = c("CL.Spec","CL.Gen"))

```

### Use of the general classifier

Aggregating by speaker and by location and ploting
```{r}
  
# per speaker
  dta.v.sort$Substitute.p <- as.numeric(as.factor(dta.v.sort$Variant))-1
  dta.v.sort.aggr.speaker <-aggregate(Substitute.p ~ Speaker_ID+Long+Lat, 
                                data= dta.v.sort[,c("Speaker_ID","Lat","Long",
                                                    "Substitute.p")],FUN=mean)

# per location
  dta.v.sort.aggr <-aggregate(Substitute.p ~ Location+Long+Lat, 
                                data= dta.v.sort[,c("Location","Lat","Long",
                                                    "Substitute.p")],FUN=mean)
  summary(dta.v.sort$Substitute.p)
  nrow(dta.v.sort.aggr)
  dta.v.sort.aggr$Value.p <- dta.v.sort.aggr$Substitute.p
```

```{r, out.width="80%", warning=FALSE, results= 'hide', fig.cap="Figure: Sortal classifiers on base map"} 
  
    plot.map.gradient(dta.v.sort.aggr)
```

### List of sortal classifiers

```{r}
  dta.v.sort$sortalCL <- dta.v.sort$Value
  dta.v.sort$sortalCL <- gsub(" .*","",dta.v.sort$sortalCL)
  
  sortal.classifiers <- as.data.frame(xtabs(~sortalCL, dta.v.sort))
  kable(sortal.classifiers) %>%
  kable_styling("striped", full_width = F)
  
  sum(sortal.classifiers$Freq)
  
```

## Mensural classifiers

### Data

Creating a frequency table for mensural classifiers
```{r}
  nrow(dta.v.meas)

## prompts 
  mensural_table <- dta.v.meas %>%
    group_by(Variant) %>%
    summarise(frequency = n()) %>%
    mutate(percentage = round(frequency*100 / sum(frequency),1))
kable(mensural_table) %>%
  kable_styling("striped", full_width = F)
  
```

Creating a frequency table for mensural classifiers by measure type
```{r}
  subtype_table <- dta.v.meas %>%
    group_by(Subtype,Variant) %>%
    summarise(frequency = n()) 
  subtype.df <- dcast(data.frame(subtype_table), ... ~Variant)
  subtype.df$sum <- subtype.df$'CL.Gen CL.Spec'+ subtype.df$CL.Spec
  subtype.df$pCL.Gen <- round(100*subtype.df$'CL.Gen CL.Spec'/subtype.df$sum,1)
  subtype.df$pCL.Spec <- round(100*subtype.df$CL.Spec/subtype.df$sum,1)

  kable(subtype.df) %>%
  kable_styling("striped", full_width = F)
  
```

### Use of multiple classifiers

Aggregating by speaker and by location and ploting

```{r}
  dta.v.meas$Variant <- factor(dta.v.meas$Variant, levels = c("CL.Spec","CL.Gen CL.Spec"))
  
  dta.v.meas$Insert.p <- as.numeric(dta.v.meas$Variant)-1
  dta.v.meas.aggr.speaker <-aggregate(Insert.p ~ Speaker_ID+Long.jitter+Lat.jitter, 
                                        data= dta.v.meas[,c("Speaker_ID",
                                                            "Lat.jitter",
                                                            "Long.jitter",
                                                            "Insert.p")],
                                        FUN=mean)
  dta.v.meas.aggr <-aggregate(Insert.p ~ Long+Lat, 
                                        data= dta.v.meas[,c("Lat","Long","Insert.p")],
                                        FUN=mean)
  dta.v.meas.aggr$Value.p <- dta.v.meas.aggr$Insert.p
  dta.v.meas.aggr.speaker$Value.p <- dta.v.meas.aggr.speaker$Insert.p
  summary(dta.v.meas.aggr$Value.p)
  summary(dta.v.meas.aggr.speaker$Value.p)
```

```{r, out.width="80%", warning=FALSE, results= 'hide', fig.cap="Figure: Mensural classifiers on base map"} 
  plot.map.gradient(dta.v.meas.aggr)
  
```


Entering the p of substituting sortal by general to the mensural classifier table (for statistic evaluations)
```{r}
  dta.v.meas <- merge(dta.v.meas,
                      dta.v.sort.aggr.speaker[,c("Speaker_ID","Substitute.p")],
                      by="Speaker_ID",
                       all.x = T)
```


### List of mensural classifiers

Creating lists of classifiers for the Appendix
```{r}
  dta.v.meas$mensuralCL <- dta.v.meas$Value
  dta.v.meas$mensuralCL <- str_remove(dta.v.meas$mensuralCL, "p'éel ")
  dta.v.meas$mensuralCL <- gsub(" .*","",dta.v.meas$mensuralCL)
  
  result.type <- as.data.frame(xtabs(~mensuralCL+Variant+Subtype, dta.v.meas))
  result.type.2 <- dcast(result.type, mensuralCL + Subtype ~ Variant)
  result.type.2$sum <- result.type.2$'CL.Gen CL.Spec'+result.type.2$CL.Spec
  result.type.2$percent <- 100*result.type.2$'CL.Gen CL.Spec'/result.type.2$sum
  result.type.2 <- droplevels(subset(result.type.2,percent != "NaN"))
  kable(result.type.2) %>%
  kable_styling("striped", full_width = F) %>%
  scroll_box(width = "100%", height = "200px")
  
```


### Frequency of mensural in multiple classifiers

create scatter plot

```{r, out.width="80%", warning=FALSE, results= 'hide', fig.cap="Figure: Scatter plot for Measure types"} 

  plot.scatter(result.type.2)  

```

## Inferential stats: Generalized additive models (GAM)

### Sortal Classifiers

Maximal model:
```{r}
  
gam.sortal.0 = mgcv::gam((Variant=="CL.Gen") ~ 
                             Spanish_odds +
                             Birthyear +
                             Pop.size +
                           s(Long, Lat, bs="tp",k=15), 
                         data=dta.v.sort,method="ML")
print(summary(gam.sortal.0))    
```

Model comparisons:
```{r,fig.width=8, fig.height=5,fig.fullwidth=T,warning=FALSE, fig.cap="Figure: Space coefficients for sortal classifiers"}  
gam.sortal.1 <- update(gam.sortal.0, . ~ . -Pop.size)
    compareML(gam.sortal.0,gam.sortal.1,suggest.report = T)
gam.sortal.2 <- update(gam.sortal.1, . ~ . -Birthyear)
    compareML(gam.sortal.1,gam.sortal.2,suggest.report = T)
gam.sortal.3 <- update(gam.sortal.2, . ~ . -Spanish_odds)
    compareML(gam.sortal.2,gam.sortal.3,suggest.report = T)
```

Coefficients of winner model
```{r}  
  print(summary(gam.sortal.2))
```

Coefficients of smooth term of Space
```{r, out.width="80%", warning=FALSE, results= 'hide', fig.cap="Figure: Space coefficients for Sorts"} 
plot.gam(gam.sortal.2)
  
```

### Mensural classifiers

Contrast coding: successive differences of subtypes
```{r}
  dta.v.meas$Subtype <- factor(dta.v.meas$Subtype)
  contrasts(dta.v.meas$Subtype) <- MASS::contr.sdif(3)
  
```

model.0: Maximal model
```{r}
  gam.mensural.0 = mgcv::gam((Variant=="CL.Gen CL.Spec") ~ 
                             Substitute.p +
                             Subtype +
                             Spanish_odds +
                             Birthyear +
                             Pop.size +
                             s(Long, Lat, bs="tp",k=15,by=Subtype), 
                           data=dta.v.meas,method="ML")
  print(summary(gam.mensural.0))    
```
Removing SPACE : MEASURE TYPE 

```{r}
  gam.mensural.0.interaction = mgcv::gam((Variant=="CL.Gen CL.Spec") ~ 
                             Substitute.p +
                             Subtype +
                             Spanish_odds +
                             Birthyear +
                             Pop.size +
                             s(Long, Lat, bs="tp",k=15), 
                           data=dta.v.meas,method="ML")
  print(summary(gam.mensural.0.interaction))    
```

Model comparison, winner: model.0
```{r}
  compareML(gam.mensural.0,gam.mensural.0.interaction,suggest.report = T)
```

Removing POPULATION SIZE

Model comparison: non-significant

```{r}
  gam.mensural.1 <- update(gam.mensural.0, . ~ . -Pop.size)
    compareML(gam.mensural.0,gam.mensural.1,suggest.report = T)
```

Removing TIME

Model comparison: non-significant

```{r}
  gam.mensural.2 <- update(gam.mensural.1, . ~ . -Birthyear)
    compareML(gam.mensural.1,gam.mensural.2,suggest.report = T)
```

Removing SPANISH ODDS

Model comparison: significant

```{r}
  gam.mensural.3 <- update(gam.mensural.2, . ~ . -Spanish_odds)
    compareML(gam.mensural.2,gam.mensural.3,suggest.report = T)#win 2
```

Removing MEASURE TYPE

Model comparison: significant

```{r}
  gam.mensural.4 <- update(gam.mensural.2, . ~ . -Subtype)
    compareML(gam.mensural.2,gam.mensural.4,suggest.report = T)#win 2
```

Collapsing Parts and Sums

Model comparison: significant

```{r}
  gam.mensural.41 <- update(gam.mensural.4, . ~ . +TypePS)
    compareML(gam.mensural.2,gam.mensural.41,suggest.report = T)#win 2
```

Collapsing Parts and Portions

Model comparison: significant

```{r}
  gam.mensural.42 <- update(gam.mensural.4, . ~ . +TypePP)
    compareML(gam.mensural.2,gam.mensural.42,suggest.report = T)#win 2
```

Removing Likelihood of Substitution

Model comparison: significant

```{r}
  gam.mensural.5 <- update(gam.mensural.2, . ~ . -Substitute.p)
    compareML(gam.mensural.2,gam.mensural.5,suggest.report = T)
```

Coefficients of winner model

```{r}  
  summary(gam.mensural.2)
  
```

Coefficients of the smooth term of space

```{r, out.width="80%", warning=FALSE, results= 'hide', fig.cap="Figure: Space coefficients for Parts"}  
  plot.gam.2(gam.mensural.2,"part")

```
```{r, out.width="80%", warning=FALSE, results= 'hide',fig.cap="Figure: Space coefficients for Sums"}  
  plot.gam.2(gam.mensural.2,"sum")

```

```{r, out.width="80%", results= 'hide', warning=FALSE, fig.cap="Figure: Space coefficients for Portions"}  
  plot.gam.2(gam.mensural.2,"portion")

```


# References

## R and R Version

```{r}
version[['version.string']]
```

```{r}
print(citation())
```


## Packages

```{r}

packages <- c("readxl","writexl","reshape2","scales","dplyr",
              "stringr","showtext","ggplot2","ggmap","ggspatial",
              "mgcv","mgcViz","itsadug")

knitr::write_bib(c(packages), width = 60)
```

 --- end of documentation ---