Ejemplo análisis exploratorios genómica de poblaciones

SNPRelate

Instalar (correr solo una vez, lo dejo comentado para su referencia)

#source("http://bioconductor.org/biocLite.R")
#biocLite("gdsfmt")
#biocLite("SNPRelate")

Cargar paquetes que se utilizarán

library(SNPRelate)

## Loading required package: gdsfmt

## SNPRelate -- supported by Streaming SIMD Extensions 2 (SSE2)

library(ape)
library(ggplot2)

Cargar datos

##### Para usar con SNPRelate
## Crear datos en formato gds a partir de plink
snpgdsBED2GDS("../data/maicesArtegaetal2015.bed", 
              "../data/maicesArtegaetal2015.fam", 
              "../data/maicesArtegaetal2015.bim", 
              out.gdsfn="../data/maicesArtegaetal2015.gds", 
              option = snpgdsOption(Z=10)) # 10 cromosomas

## Start snpgdsBED2GDS ...
##  BED file: "../data/maicesArtegaetal2015.bed" in the SNP-major mode (Sample X SNP)
##  FAM file: "../data/maicesArtegaetal2015.fam", DONE.
##  BIM file: "../data/maicesArtegaetal2015.bim", DONE.
## Thu Feb  7 12:17:43 2019     store sample id, snp id, position, and chromosome.
##  start writing: 165 samples, 36931 SNPs ...
##      Thu Feb  7 12:17:43 2019    0%
##      Thu Feb  7 12:17:44 2019    100%
## Thu Feb  7 12:17:44 2019     Done.
## Optimize the access efficiency ...
## Clean up the fragments of GDS file:
##     open the file '../data/maicesArtegaetal2015.gds' (1.6M)
##     # of fragments: 39
##     save to '../data/maicesArtegaetal2015.gds.tmp'
##     rename '../data/maicesArtegaetal2015.gds.tmp' (1.6M, reduced: 252B)
##     # of fragments: 18

# Ver resumen (esto no carga el archivo)
snpgdsSummary("../data/maicesArtegaetal2015.gds")

## Some values of 'snp.chromosome' are invalid (should be finite and >= 1)!

## Hint: specifying 'autosome.only=FALSE' in the analysis could avoid detecting chromosome coding.

## Some of 'snp.allele' are not standard (e.g., 0/G).

## The file name: /Users/ticatla/hubiC/Science/Teaching/Mx/BioinfInvgRepro/BioinfinvRepro/Unidad5/Prac_Uni5/maices/data/maicesArtegaetal2015.gds 
## The total number of samples: 165 
## The total number of SNPs: 36931 
## SNP genotypes are stored in SNP-major mode (Sample X SNP).
## The number of valid samples: 165 
## The number of biallelic unique SNPs: 30804

# Cargar archivo para trabajar con el
genofile <- snpgdsOpen("../data/maicesArtegaetal2015.gds")

# Check snp.ids
head(read.gdsn(index.gdsn(genofile, "snp.id")))

## [1] "abph1.15"  "ae1.8"     "an1.3"     "ba1.5"     "ba1.7"     "csu1138.4"

# Check sample.ids
head(read.gdsn(index.gdsn(genofile, "sample.id")))

## [1] "maiz_3"  "maiz_68" "maiz_91" "maiz_39" "maiz_12" "maiz_41"

# Obtener nombres muestras del gdsn
sample.id <- read.gdsn(index.gdsn(genofile, "sample.id"))
sample.id

##   [1] "maiz_3"    "maiz_68"   "maiz_91"   "maiz_39"   "maiz_12"  
##   [6] "maiz_41"   "maiz_35"   "maiz_58"   "maiz_51"   "maiz_82"  
##  [11] "maiz_67"   "maiz_93"   "maiz_21"   "maiz_6"    "maiz_101" 
##  [16] "maiz_27"   "maiz_43"   "maiz_1"    "maiz_33"   "maiz_100" 
##  [21] "maiz_24"   "maiz_103"  "maiz_72"   "maiz_10"   "maiz_28"  
##  [26] "maiz_49"   "maiz_56"   "maiz_66"   "maiz_52"   "maiz_47"  
##  [31] "maiz_80"   "maiz_65"   "maiz_94"   "maiz_36"   "maiz_26"  
##  [36] "maiz_105"  "maiz_30"   "maiz_16"   "maiz_42"   "maiz_4"   
##  [41] "maiz_31"   "maiz_17"   "maiz_46"   "maiz_5"    "maiz_32"  
##  [46] "maiz_19"   "maiz_50"   "maiz_8"    "maiz_34"   "maiz_23"  
##  [51] "maiz_54"   "maiz_14"   "maiz_37"   "maiz_25"   "maiz_55"  
##  [56] "maiz_40"   "maiz_29"   "maiz_60"   "maiz_44"   "maiz_74"  
##  [61] "maiz_89"   "maiz_64"   "maiz_83"   "maiz_75"   "maiz_92"  
##  [66] "maiz_69"   "maiz_84"   "maiz_76"   "maiz_97"   "maiz_70"  
##  [71] "maiz_85"   "maiz_77"   "maiz_71"   "maiz_86"   "maiz_78"  
##  [76] "maiz_102"  "maiz_73"   "maiz_88"   "maiz_79"   "maiz_106" 
##  [81] "maiz_119"  "maiz_148"  "maiz_108"  "maiz_120"  "maiz_151" 
##  [86] "maiz_134"  "maiz_123"  "maiz_153"  "maiz_135"  "maiz_124" 
##  [91] "maiz_184"  "maiz_140"  "maiz_125"  "maiz_141"  "maiz_131" 
##  [96] "maiz_189"  "maiz_57"   "maiz_126"  "maiz_113"  "maiz_138" 
## [101] "maiz_63"   "maiz_127"  "maiz_114"  "maiz_139"  "maiz_99"  
## [106] "maiz_129"  "maiz_116"  "maiz_142"  "maiz_110"  "maiz_132" 
## [111] "maiz_118"  "maiz_144"  "maiz_133"  "maiz_121"  "maiz_111" 
## [116] "maiz_137"  "maiz_109"  "maiz_146"  "maiz_164"  "maiz_157" 
## [121] "maiz_171"  "maiz_149"  "maiz_165"  "maiz_159"  "maiz_172" 
## [126] "maiz_150"  "maiz_166"  "maiz_160"  "maiz_173"  "maiz_152" 
## [131] "maiz_167"  "maiz_161"  "maiz_174"  "maiz_154"  "maiz_169" 
## [136] "maiz_162"  "maiz_176"  "maiz_156"  "maiz_170"  "maiz_163" 
## [141] "maiz_177"  "maiz_178"  "maiz_192"  "maiz_185"  "maiz_201" 
## [146] "maiz_179"  "maiz_193"  "maiz_186"  "maiz_202"  "maiz_180" 
## [151] "maiz_195"  "maiz_187"  "maiz_181"  "maiz_197"  "maiz_188" 
## [156] "maiz_182"  "maiz_198"  "maiz_190"  "maiz_200"  "maiz_183" 
## [161] "maiz_191"  "teos_96"   "teos_203"  "teos_911"  "teos_9107"

##### Metadata
# load
fullmat<- read.delim(file= "../meta/maizteocintle_SNP50k_meta_extended.txt")

# check
head(fullmat)

##   OrderColecta NSiembra          Origen                  Raza     Estado
## 1            2        3  INIFAP-2009-16              Apachito  Chihuahua
## 2            1       68         2009-72              Chalque̱o   Tlaxcala
## 3            1       91         2007-33 Dulcillo del Noroeste     Sonora
## 4            2       39    Chis-2009-18            Dzit-Bacal    Chiapas
## 5            2       12 Celaya-2009-114                Celaya Guanajuato
## 6            2       41   Celaya-2009-2   Elotes Occidentales Guanajuato
##     Num_Colecta     Nombre_comun         Raza_Primaria Raza_Secundaria
## 1            16    Ocho carreras              Apachito              NA
## 2            22 Chalque̱o Criollo              Chalque̱o              NA
## 3   SON2007-033   Ma\xcc_z Dulce Dulcillo del Noroeste              NA
## 4   Repetida 18   Olotillo crema            Dzit-Bacal              NA
## 5 2009-REPO-114 Ma\xcc_z Criollo                Celaya              NA
## 6 2009-REPO-002 Ma\xcc_z Criollo   Elotes Occidentales              NA
##   A.o._de_colecta              Localidad  Municipio   Estado.1   Longitud
## 1            2009     Santo Tom\xcc\xc1s   Guerrero  Chihuahua -107.58300
## 2            2008       Ignacio Zaragoza Cuapiaxtla   Tlaxcala  -97.92944
## 3            2007            Agua Blanca      Y̩cora     Sonora -108.92467
## 4            2009 Nuevo Vicente Guerrero Villacorzo    Chiapas  -92.97972
## 5            2009            El Ahuacate  Uriangato Guanajuato -101.11889
## 6            2009              Comonfort  Comonfort Guanajuato -100.76583
##    Latitud Altitud                         Ruizetal2008_grupo
## 1 28.68578    1975                       1A_templado540-640mm
## 2 19.29083    2497                                           
## 3 28.53703    1435                     2A_semicalido500-870mm
## 4 16.03225     618                     3A_muycalido990-1360mm
## 5 20.09111    1877 2A_semicalido500-870mm y 1B_templado>650mm
## 6 20.74417    1800 2C_semicalido740-855mm y 1B_templado>650mm
##           Sanchezetal_grupo Categ.Altitud                 Rbiogeo
## 1       Sierra de Chihuahua           mid Sierra Madre Occidental
## 2                C\xcc_nico          high           Eje Volcanico
## 3                 Chapalote           mid Sierra Madre Occidental
## 4 Maduraci\xcc_n tard\xcc_a           low      Costa del Pacifico
## 5       Dentados tropicales           mid           Eje Volcanico
## 6              Ocho Hileras           mid           Eje Volcanico
##              DivFloristic     PeralesBiog
## 1 Sierra Madre Occidental Ca\xe5_ones Chi
## 2  Serranias Meridionales     Mesa Centra
## 3 Sierra Madre Occidental     Sierras del
## 4  Serranias Transismicas         Chiapas
## 5            Altiplanicie    Baj\xe5\xc1o
## 6            Altiplanicie    Baj\xe5\xc1o

nrow(fullmat)

## [1] 165

head(fullmat$NSiembra) # corresponde al número del ID de las muestras

## [1]  3 68 91 39 12 41

head(sample.id)

## [1] "maiz_3"  "maiz_68" "maiz_91" "maiz_39" "maiz_12" "maiz_41"

Realizar PCA

# PCA
pca <- snpgdsPCA(genofile, num.thread=2)

## Principal Component Analysis (PCA) on genotypes:
## Excluding 5,290 SNPs on non-autosomes
## Excluding 837 SNPs (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
## Working space: 165 samples, 30,804 SNPs
##     using 2 (CPU) cores
## PCA:    the sum of all selected genotypes (0,1,2) = 2543593
## CPU capabilities: Double-Precision SSE2
## Thu Feb  7 12:17:44 2019    (internal increment: 2156)
## 
[..................................................]  0%, ETC: ---        
[==================================================] 100%, completed in 0s
## Thu Feb  7 12:17:44 2019    Begin (eigenvalues and eigenvectors)
## Thu Feb  7 12:17:44 2019    Done.

# Calcular el % de variación contenido por los primeros componentes
pc.percent <- pca$varprop*100
head(round(pc.percent, 2))

## [1] 6.44 2.13 1.42 1.19 1.10 1.08

x<-round(pc.percent, 2)
sum(x[1:4])

## [1] 11.18

sum(x[1:10])

## [1] 17.08

sum(x[1:30])

## [1] 31.92

# Poner resultados en df
tab <- data.frame(sample.id = pca$sample.id,
    EV1 = pca$eigenvect[,1],    # the first eigenvector
    EV2 = pca$eigenvect[,2],    # the second eigenvector
    stringsAsFactors = FALSE)
head(tab)

##   sample.id         EV1         EV2
## 1    maiz_3 -0.05654163  0.11522741
## 2   maiz_68 -0.09909519 -0.05460776
## 3   maiz_91 -0.01718140  0.24928583
## 4   maiz_39  0.10705139 -0.05925765
## 5   maiz_12 -0.01515252  0.01879029
## 6   maiz_41 -0.02124046  0.03231120

# Plot
ggplot(data = tab, aes(x=EV2, y=EV1)) + geom_point() +
  ylab(paste0("eigenvector 1 explaining ", round(pc.percent, 2)[1], "%")) +
  xlab(paste0("eigenvector 2 explaining ", round(pc.percent, 2)[2], "%"))

Ejercicio: repite el PCA y plot anterior pero utilizando sólo los SNPS con MAF>=0.05 (debeles seleccionarlos desde R)

## Principal Component Analysis (PCA) on genotypes:
## Excluding 5,290 SNPs on non-autosomes
## Excluding 3,633 SNPs (monomorphic: TRUE, MAF: 0.05, missing rate: NaN)
## Working space: 165 samples, 28,008 SNPs
##     using 2 (CPU) cores
## PCA:    the sum of all selected genotypes (0,1,2) = 2520895
## CPU capabilities: Double-Precision SSE2
## Thu Feb  7 12:17:45 2019    (internal increment: 2156)
## 
[..................................................]  0%, ETC: ---        
[==================================================] 100%, completed in 0s
## Thu Feb  7 12:17:45 2019    Begin (eigenvalues and eigenvectors)
## Thu Feb  7 12:17:45 2019    Done.

## [1] 6.90 2.19 1.38 1.19 1.09 0.99

## [1] 11.66

## [1] 17.35

## [1] 32.17

##   sample.id         EV1         EV2
## 1    maiz_3 -0.05604816  0.11863611
## 2   maiz_68 -0.09949387 -0.05498846
## 3   maiz_91 -0.01601032  0.24887019
## 4   maiz_39  0.10733451 -0.06104180
## 5   maiz_12 -0.01527339  0.02222495
## 6   maiz_41 -0.02034604  0.03650302

Repite el PCA coloreando los maíces por Categoría de Altitud.

# obtner info Categ.Altitud
pop_code <- as.vector(fullmat$Categ.Altitud) 

# hacer pop_codes raza coincidan con samples
tab <- data.frame(sample.id = pca$sample.id,
    pop = factor(pop_code)[match(pca$sample.id, sample.id)],
    EV1 = pca$eigenvect[,1],    # the first eigenvector
    EV2 = pca$eigenvect[,2],    # the second eigenvector
    stringsAsFactors = FALSE)
head(tab)

##   sample.id  pop         EV1         EV2
## 1    maiz_3  mid -0.05604816  0.11863611
## 2   maiz_68 high -0.09949387 -0.05498846
## 3   maiz_91  mid -0.01601032  0.24887019
## 4   maiz_39  low  0.10733451 -0.06104180
## 5   maiz_12  mid -0.01527339  0.02222495
## 6   maiz_41  mid -0.02034604  0.03650302

# Plot

ggplot(data = tab, aes(x=EV2, y=EV1, colour = pop)) + geom_point() +
  ylab(paste0("eigenvector 1 explaining ", round(pc.percent, 2)[1], "%")) +
  xlab(paste0("eigenvector 2 explaining ", round(pc.percent, 2)[2], "%"))

Ejercicio: repite el plot anterior pero coloreando en un gradiente de colores (“green” para bajo y “brown” para alto). Pista para el gradiente de color en este link. La leyenda debe llevar debe decir “Altitude (masl)”.

Debe verse así:

¿Necesitas otra pista? considera que ggplot2 solo puede hacer gradietes de variables numéricas.