Introducción

Esta es una versión de la presentación utilizada en la sesión «Análisis estadístico y visualización de datos» del workshop de data science; incluye el código que, por simplicidad y espacio, no se mostró en la presentación.

Análisis estadístico y visualización de datos

Objetivo:

Análisis estadístico y visualización de datos

Advertencia: Las conclusiones solo son válidas para los datos analizados. Para obtener conclusiones sobre el conjunto de la población es necesaria la inferencia estadística.

Los datos del Titanic como ejemplo

El RMS Titanic, con capacidad para 2.435 pasajeros y 892 miembros de la tripulación, se hundió el 15 de abril de 1912 dejando unos 700 supervivientes.

En esta sesión utilizaremos como ejemplo los datos de una parte de los pasajeros.

library(readr)
library(dplyr)
library(tidyr)
library(forcats)
library(ggplot2)
library(ggmosaic)

# Data preparation: read the passenger file and derive the categorical
# variables used by the tables and plots below.
#   passenger.class : ticket class relabelled as "1st" / "2nd" / "3rd"
#   survival        : "died" / "lived", recoded from the "0" / "1" codes
#   class           : ticket class as a factor keeping the raw codes
#   age_group       : ">=18" or "<18" (NA when age is missing)
titanic <- read_csv("titanic_data.txt") %>%
  mutate(
    passenger.class = fct_recode(
      as.factor(pclass),
      "1st" = "1", "2nd" = "2", "3rd" = "3"
    ),
    survival = fct_recode(
      as.factor(survived),
      "died" = "0", "lived" = "1"
    ),
    class = as.factor(pclass),
    age_group = ifelse(age >= 18, ">=18", "<18")
  )

Survival

# Absolute frequencies of survival, with the grand total appended
addmargins(table(titanic[["survival"]]))
## 
##  died lived   Sum 
##   809   500  1309
# Relative frequencies of survival; proportions() divides the table by its
# total, so the data are tabulated only once
proportions(table(titanic$survival))
## 
##     died    lived 
## 0.618029 0.381971

Survival

# Bar chart of passenger counts per survival outcome, one colour per outcome
ggplot(titanic, aes(x = survival, fill = survival)) +
  geom_bar()

Sex

# Absolute frequencies of sex, with the grand total appended
addmargins(table(titanic[["sex"]]))
## 
## female   male    Sum 
##    466    843   1309
# Relative frequencies of sex; proportions() divides the table by its total,
# so the data are tabulated only once
proportions(table(titanic$sex))
## 
##    female      male 
## 0.3559969 0.6440031

Sex

# Bar chart of passenger counts per sex, one colour per sex
ggplot(titanic, aes(x = sex, fill = sex)) +
  geom_bar()

Sex and Survival

# Two-way contingency table of sex by survival, with row and column totals
addmargins(table(sex = titanic$sex, survival = titanic$survival))
##         survival
## sex      died lived  Sum
##   female  127   339  466
##   male    682   161  843
##   Sum     809   500 1309

Sex and Survival

# Passenger counts per sex; each bar is split by survival outcome
ggplot(titanic, aes(x = sex, fill = survival)) +
  geom_bar()

Sex and Survival

# Mosaic plot of sex against survival (aesthetics are given to the mosaic
# layer itself)
ggplot(data = titanic) +
  geom_mosaic(mapping = aes(x = product(sex), fill = survival))

Sex and Survival

# Survival proportions within each sex (rows sum to 1); a Sum column is added
addmargins(
  proportions(with(titanic, table(sex, survival)), margin = 1),
  margin = 2
)
##         survival
## sex           died     lived       Sum
##   female 0.2725322 0.7274678 1.0000000
##   male   0.8090154 0.1909846 1.0000000
# Sex proportions within each survival outcome (columns sum to 1); a Sum row
# is added
addmargins(
  proportions(with(titanic, table(sex, survival)), margin = 2),
  margin = 1
)
##         survival
## sex           died     lived
##   female 0.1569839 0.6780000
##   male   0.8430161 0.3220000
##   Sum    1.0000000 1.0000000

Class

# Absolute frequencies of passenger class, with the grand total appended
addmargins(table(titanic[["class"]]))
## 
##    1    2    3  Sum 
##  323  277  709 1309
# Relative frequencies of passenger class; proportions() divides the table by
# its total, so the data are tabulated only once
proportions(table(titanic$class))
## 
##         1         2         3 
## 0.2467532 0.2116119 0.5416348

Class

# Bar chart of passenger counts per class, one colour per class
ggplot(titanic, aes(x = class, fill = class)) +
  geom_bar()

Class and Survival

# Passenger counts per class; each bar is split by survival outcome
ggplot(titanic, aes(x = class, fill = survival)) +
  geom_bar()

Class and Survival

# Joint proportions of class and survival over all passengers, with row and
# column totals
addmargins(proportions(with(titanic, table(class, survival))))
##      survival
## class       died      lived        Sum
##   1   0.09396486 0.15278839 0.24675325
##   2   0.12070283 0.09090909 0.21161192
##   3   0.40336134 0.13827349 0.54163484
##   Sum 0.61802903 0.38197097 1.00000000

Class and Survival

# Survival proportions within each class (rows sum to 1); a Sum column is
# added
addmargins(
  proportions(with(titanic, table(class, survival)), margin = 1),
  margin = 2
)
##      survival
## class      died     lived       Sum
##     1 0.3808050 0.6191950 1.0000000
##     2 0.5703971 0.4296029 1.0000000
##     3 0.7447109 0.2552891 1.0000000
# Class proportions within each survival outcome (columns sum to 1); a Sum
# row is added
addmargins(
  proportions(with(titanic, table(class, survival)), margin = 2),
  margin = 1
)
##      survival
## class      died     lived
##   1   0.1520396 0.4000000
##   2   0.1953028 0.2380000
##   3   0.6526576 0.3620000
##   Sum 1.0000000 1.0000000

Class and Survival

# Stacked bar chart: counts per class, filled by survival outcome
ggplot(data = titanic, mapping = aes(x = class, fill = survival)) +
  geom_bar()

Sex, class and survival

# Passenger counts per sex, split by survival, one panel per class
ggplot(titanic, aes(x = sex, fill = survival)) +
  geom_bar() +
  facet_grid(cols = vars(class))

# Three-way contingency table behind the plot: sex by survival, one slice per
# class
with(titanic, table(sex, survival, class))
## , , class = 1
## 
##         survival
## sex      died lived
##   female    5   139
##   male    118    61
## 
## , , class = 2
## 
##         survival
## sex      died lived
##   female   12    94
##   male    146    25
## 
## , , class = 3
## 
##         survival
## sex      died lived
##   female  110   106
##   male    418    75

Age

# Five-number summary and mean of age. summary() reports the number of NA's
# on its own; it has no `useNA` argument (that belongs to table()), so the
# stray argument is dropped.
summary(titanic$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.1667 21.0000 28.0000 29.8811 39.0000 80.0000     263

Age

# Histogram of age; rows with missing age are dropped without a warning
ggplot(titanic, aes(x = age)) +
  geom_histogram(na.rm = TRUE)

Age & survival

# Histogram of age with bars split by survival outcome; rows with missing age
# are dropped without a warning
ggplot(titanic, aes(x = age, fill = survival)) +
  geom_histogram(na.rm = TRUE)

Age, sex, class and survival

# Age histograms split by survival, one panel per sex (rows) and class
# (columns). na.rm = TRUE drops rows with missing age without a warning, as
# the other age histograms in this file do.
ggplot(titanic, aes(x = age, fill = survival)) +
  geom_histogram(na.rm = TRUE) +
  facet_grid(rows = vars(sex), cols = vars(class))

Age, class and Survival

# Passenger counts per age group, split by survival, one panel per class.
# Facets use the `class` factor, like every other plot in this file, rather
# than the raw numeric `pclass` column.
ggplot(titanic, aes(x = age_group, fill = survival)) +
  geom_bar() +
  facet_grid(cols = vars(class))

Age, sex, class and Survival

# Passenger counts per age group, split by survival, with one panel per sex
# (rows) and class (columns)
ggplot(titanic, aes(x = age_group, fill = survival)) +
  geom_bar() +
  facet_grid(rows = vars(sex), cols = vars(class))

Fare

# Histogram of fare; rows with missing fare are dropped without a warning
ggplot(titanic, aes(x = fare)) +
  geom_histogram(na.rm = TRUE)

Fare and class

# Histogram of fare with bars split by passenger class
ggplot(titanic, aes(x = fare, fill = class)) +
  geom_histogram(na.rm = TRUE)

Fare and class

# Box plot of fare for each passenger class
ggplot(titanic, aes(x = class, y = fare, fill = class)) +
  geom_boxplot(na.rm = TRUE)

Fare and class

# Summary statistics of fare over all passengers
summary(titanic[["fare"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   7.896  14.454  33.295  31.275 512.329       1
# Summary statistics of fare computed separately for each class
aggregate(fare ~ class, data = titanic, FUN = summary)
##   class fare.Min. fare.1st Qu. fare.Median fare.Mean fare.3rd Qu. fare.Max.
## 1     1   0.00000     30.69580    60.00000  87.50899    107.66250 512.32920
## 2     2   0.00000     13.00000    15.04580  21.17920     26.00000  73.50000
## 3     3   0.00000      7.75000     8.05000  13.30289     15.24580  69.55000

Fare and age

# Scatter plot of fare against age; incomplete rows are dropped without a
# warning
ggplot(titanic, aes(x = age, y = fare)) +
  geom_point(na.rm = TRUE)

Fare and age

# Scatter plot of fare against age with a fitted linear regression line
ggplot(titanic, aes(x = age, y = fare)) +
  geom_point(na.rm = TRUE) +
  geom_smooth(method = "lm", na.rm = TRUE)

Fare and age

# Intercept and slope of the linear regression of fare on age
coef(lm(fare ~ age, data = titanic))
## (Intercept)         age 
##  16.0197563   0.6922966

\(\hat{fare} = 16.0197563 + 0.6922966 \cdot age\)

Fare, age and class

# Fare against age coloured by class, with one regression line per class
ggplot(titanic, aes(x = age, y = fare, color = class)) +
  geom_point(na.rm = TRUE) +
  geom_smooth(method = "lm", na.rm = TRUE)

Fare, age and class

# Fare against age coloured by class, with one regression line per class plus
# a black line fitted to all passengers. Colour is mapped per layer so the
# black line is not split by class.
ggplot(titanic, aes(x = age, y = fare)) +
  geom_point(aes(color = class), na.rm = TRUE) +
  geom_smooth(aes(color = class), method = "lm", na.rm = TRUE) +
  geom_smooth(method = "lm", na.rm = TRUE, color = "black")

Fare, age and class

# Coefficients of the regression of fare on age with a class interaction
# (separate intercept and slope per class, relative to class 1)
coef(lm(fare ~ age * class, data = titanic))
## (Intercept)         age      class2      class3  age:class2  age:class3 
## 116.2097283  -0.6123703 -88.9199766 -98.4956718   0.4281848   0.4169871

1st class: \(\hat{fare} = 116.2097283 -0.6123703 \cdot age\)

2nd class: \(\hat{fare} = 27.2897517 -0.1841855 \cdot age\)

3rd class: \(\hat{fare} = 17.7140565 -0.1953832 \cdot age\)

Age and class

# Histogram of age with bars split by passenger class
ggplot(titanic, aes(x = age, fill = class)) +
  geom_histogram(na.rm = TRUE)

Age and class

# Box plot of age for each passenger class
ggplot(titanic, aes(x = class, y = age, fill = class)) +
  geom_boxplot(na.rm = TRUE)

Lecturas adicionales - 1

Lecturas adicionales - 2