#source("http://bioconductor.org/biocLite.R")
#biocLite("gdsfmt")
#biocLite("SNPRelate")
library(SNPRelate)
## Loading required package: gdsfmt
## SNPRelate -- supported by Streaming SIMD Extensions 2 (SSE2)
library(ape)
library(ggplot2)
##### Para usar con SNPRelate
## Crear datos en formato gds a partir de plink
snpgdsBED2GDS("../data/maicesArtegaetal2015.bed",
"../data/maicesArtegaetal2015.fam",
"../data/maicesArtegaetal2015.bim",
out.gdsfn="../data/maicesArtegaetal2015.gds",
option = snpgdsOption(Z=10)) # 10 cromosomas
## Start snpgdsBED2GDS ...
## BED file: "../data/maicesArtegaetal2015.bed" in the SNP-major mode (Sample X SNP)
## FAM file: "../data/maicesArtegaetal2015.fam", DONE.
## BIM file: "../data/maicesArtegaetal2015.bim", DONE.
## Thu Feb 7 12:17:43 2019 store sample id, snp id, position, and chromosome.
## start writing: 165 samples, 36931 SNPs ...
## Thu Feb 7 12:17:43 2019 0%
## Thu Feb 7 12:17:44 2019 100%
## Thu Feb 7 12:17:44 2019 Done.
## Optimize the access efficiency ...
## Clean up the fragments of GDS file:
## open the file '../data/maicesArtegaetal2015.gds' (1.6M)
## # of fragments: 39
## save to '../data/maicesArtegaetal2015.gds.tmp'
## rename '../data/maicesArtegaetal2015.gds.tmp' (1.6M, reduced: 252B)
## # of fragments: 18
# Ver resumen (esto no carga el archivo)
snpgdsSummary("../data/maicesArtegaetal2015.gds")
## Some values of 'snp.chromosome' are invalid (should be finite and >= 1)!
## Hint: specifying 'autosome.only=FALSE' in the analysis could avoid detecting chromosome coding.
## Some of 'snp.allele' are not standard (e.g., 0/G).
## The file name: /Users/ticatla/hubiC/Science/Teaching/Mx/BioinfInvgRepro/BioinfinvRepro/Unidad5/Prac_Uni5/maices/data/maicesArtegaetal2015.gds
## The total number of samples: 165
## The total number of SNPs: 36931
## SNP genotypes are stored in SNP-major mode (Sample X SNP).
## The number of valid samples: 165
## The number of biallelic unique SNPs: 30804
# Cargar archivo para trabajar con el
genofile <- snpgdsOpen("../data/maicesArtegaetal2015.gds")
# Check snp.ids
head(read.gdsn(index.gdsn(genofile, "snp.id")))
## [1] "abph1.15" "ae1.8" "an1.3" "ba1.5" "ba1.7" "csu1138.4"
# Check sample.ids
head(read.gdsn(index.gdsn(genofile, "sample.id")))
## [1] "maiz_3" "maiz_68" "maiz_91" "maiz_39" "maiz_12" "maiz_41"
# Obtener nombres muestras del gdsn
sample.id <- read.gdsn(index.gdsn(genofile, "sample.id"))
sample.id
## [1] "maiz_3" "maiz_68" "maiz_91" "maiz_39" "maiz_12"
## [6] "maiz_41" "maiz_35" "maiz_58" "maiz_51" "maiz_82"
## [11] "maiz_67" "maiz_93" "maiz_21" "maiz_6" "maiz_101"
## [16] "maiz_27" "maiz_43" "maiz_1" "maiz_33" "maiz_100"
## [21] "maiz_24" "maiz_103" "maiz_72" "maiz_10" "maiz_28"
## [26] "maiz_49" "maiz_56" "maiz_66" "maiz_52" "maiz_47"
## [31] "maiz_80" "maiz_65" "maiz_94" "maiz_36" "maiz_26"
## [36] "maiz_105" "maiz_30" "maiz_16" "maiz_42" "maiz_4"
## [41] "maiz_31" "maiz_17" "maiz_46" "maiz_5" "maiz_32"
## [46] "maiz_19" "maiz_50" "maiz_8" "maiz_34" "maiz_23"
## [51] "maiz_54" "maiz_14" "maiz_37" "maiz_25" "maiz_55"
## [56] "maiz_40" "maiz_29" "maiz_60" "maiz_44" "maiz_74"
## [61] "maiz_89" "maiz_64" "maiz_83" "maiz_75" "maiz_92"
## [66] "maiz_69" "maiz_84" "maiz_76" "maiz_97" "maiz_70"
## [71] "maiz_85" "maiz_77" "maiz_71" "maiz_86" "maiz_78"
## [76] "maiz_102" "maiz_73" "maiz_88" "maiz_79" "maiz_106"
## [81] "maiz_119" "maiz_148" "maiz_108" "maiz_120" "maiz_151"
## [86] "maiz_134" "maiz_123" "maiz_153" "maiz_135" "maiz_124"
## [91] "maiz_184" "maiz_140" "maiz_125" "maiz_141" "maiz_131"
## [96] "maiz_189" "maiz_57" "maiz_126" "maiz_113" "maiz_138"
## [101] "maiz_63" "maiz_127" "maiz_114" "maiz_139" "maiz_99"
## [106] "maiz_129" "maiz_116" "maiz_142" "maiz_110" "maiz_132"
## [111] "maiz_118" "maiz_144" "maiz_133" "maiz_121" "maiz_111"
## [116] "maiz_137" "maiz_109" "maiz_146" "maiz_164" "maiz_157"
## [121] "maiz_171" "maiz_149" "maiz_165" "maiz_159" "maiz_172"
## [126] "maiz_150" "maiz_166" "maiz_160" "maiz_173" "maiz_152"
## [131] "maiz_167" "maiz_161" "maiz_174" "maiz_154" "maiz_169"
## [136] "maiz_162" "maiz_176" "maiz_156" "maiz_170" "maiz_163"
## [141] "maiz_177" "maiz_178" "maiz_192" "maiz_185" "maiz_201"
## [146] "maiz_179" "maiz_193" "maiz_186" "maiz_202" "maiz_180"
## [151] "maiz_195" "maiz_187" "maiz_181" "maiz_197" "maiz_188"
## [156] "maiz_182" "maiz_198" "maiz_190" "maiz_200" "maiz_183"
## [161] "maiz_191" "teos_96" "teos_203" "teos_911" "teos_9107"
##### Metadata
# load
fullmat<- read.delim(file= "../meta/maizteocintle_SNP50k_meta_extended.txt")
# check
head(fullmat)
## OrderColecta NSiembra Origen Raza Estado
## 1 2 3 INIFAP-2009-16 Apachito Chihuahua
## 2 1 68 2009-72 Chalque̱o Tlaxcala
## 3 1 91 2007-33 Dulcillo del Noroeste Sonora
## 4 2 39 Chis-2009-18 Dzit-Bacal Chiapas
## 5 2 12 Celaya-2009-114 Celaya Guanajuato
## 6 2 41 Celaya-2009-2 Elotes Occidentales Guanajuato
## Num_Colecta Nombre_comun Raza_Primaria Raza_Secundaria
## 1 16 Ocho carreras Apachito NA
## 2 22 Chalque̱o Criollo Chalque̱o NA
## 3 SON2007-033 Ma\xcc_z Dulce Dulcillo del Noroeste NA
## 4 Repetida 18 Olotillo crema Dzit-Bacal NA
## 5 2009-REPO-114 Ma\xcc_z Criollo Celaya NA
## 6 2009-REPO-002 Ma\xcc_z Criollo Elotes Occidentales NA
## A.o._de_colecta Localidad Municipio Estado.1 Longitud
## 1 2009 Santo Tom\xcc\xc1s Guerrero Chihuahua -107.58300
## 2 2008 Ignacio Zaragoza Cuapiaxtla Tlaxcala -97.92944
## 3 2007 Agua Blanca Y̩cora Sonora -108.92467
## 4 2009 Nuevo Vicente Guerrero Villacorzo Chiapas -92.97972
## 5 2009 El Ahuacate Uriangato Guanajuato -101.11889
## 6 2009 Comonfort Comonfort Guanajuato -100.76583
## Latitud Altitud Ruizetal2008_grupo
## 1 28.68578 1975 1A_templado540-640mm
## 2 19.29083 2497
## 3 28.53703 1435 2A_semicalido500-870mm
## 4 16.03225 618 3A_muycalido990-1360mm
## 5 20.09111 1877 2A_semicalido500-870mm y 1B_templado>650mm
## 6 20.74417 1800 2C_semicalido740-855mm y 1B_templado>650mm
## Sanchezetal_grupo Categ.Altitud Rbiogeo
## 1 Sierra de Chihuahua mid Sierra Madre Occidental
## 2 C\xcc_nico high Eje Volcanico
## 3 Chapalote mid Sierra Madre Occidental
## 4 Maduraci\xcc_n tard\xcc_a low Costa del Pacifico
## 5 Dentados tropicales mid Eje Volcanico
## 6 Ocho Hileras mid Eje Volcanico
## DivFloristic PeralesBiog
## 1 Sierra Madre Occidental Ca\xe5_ones Chi
## 2 Serranias Meridionales Mesa Centra
## 3 Sierra Madre Occidental Sierras del
## 4 Serranias Transismicas Chiapas
## 5 Altiplanicie Baj\xe5\xc1o
## 6 Altiplanicie Baj\xe5\xc1o
nrow(fullmat)
## [1] 165
head(fullmat$NSiembra) # corresponde al número del ID de las muestras
## [1] 3 68 91 39 12 41
head(sample.id)
## [1] "maiz_3" "maiz_68" "maiz_91" "maiz_39" "maiz_12" "maiz_41"
# PCA
pca <- snpgdsPCA(genofile, num.thread=2)
## Principal Component Analysis (PCA) on genotypes:
## Excluding 5,290 SNPs on non-autosomes
## Excluding 837 SNPs (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
## Working space: 165 samples, 30,804 SNPs
## using 2 (CPU) cores
## PCA: the sum of all selected genotypes (0,1,2) = 2543593
## CPU capabilities: Double-Precision SSE2
## Thu Feb 7 12:17:44 2019 (internal increment: 2156)
##
[..................................................] 0%, ETC: ---
[==================================================] 100%, completed in 0s
## Thu Feb 7 12:17:44 2019 Begin (eigenvalues and eigenvectors)
## Thu Feb 7 12:17:44 2019 Done.
# Calcular el % de variación contenido por los primeros componentes
pc.percent <- pca$varprop*100
head(round(pc.percent, 2))
## [1] 6.44 2.13 1.42 1.19 1.10 1.08
x<-round(pc.percent, 2)
sum(x[1:4])
## [1] 11.18
sum(x[1:10])
## [1] 17.08
sum(x[1:30])
## [1] 31.92
# Poner resultados en df
tab <- data.frame(sample.id = pca$sample.id,
EV1 = pca$eigenvect[,1], # the first eigenvector
EV2 = pca$eigenvect[,2], # the second eigenvector
stringsAsFactors = FALSE)
head(tab)
## sample.id EV1 EV2
## 1 maiz_3 -0.05654163 0.11522741
## 2 maiz_68 -0.09909519 -0.05460776
## 3 maiz_91 -0.01718140 0.24928583
## 4 maiz_39 0.10705139 -0.05925765
## 5 maiz_12 -0.01515252 0.01879029
## 6 maiz_41 -0.02124046 0.03231120
# Plot
ggplot(data = tab, aes(x=EV2, y=EV1)) + geom_point() +
ylab(paste0("eigenvector 1 explaining ", round(pc.percent, 2)[1], "%")) +
xlab(paste0("eigenvector 2 explaining ", round(pc.percent, 2)[2], "%"))
Ejercicio: repite el PCA y plot anterior pero utilizando sólo los SNPS con MAF>=0.05 (debeles seleccionarlos desde R)
## Principal Component Analysis (PCA) on genotypes:
## Excluding 5,290 SNPs on non-autosomes
## Excluding 3,633 SNPs (monomorphic: TRUE, MAF: 0.05, missing rate: NaN)
## Working space: 165 samples, 28,008 SNPs
## using 2 (CPU) cores
## PCA: the sum of all selected genotypes (0,1,2) = 2520895
## CPU capabilities: Double-Precision SSE2
## Thu Feb 7 12:17:45 2019 (internal increment: 2156)
##
[..................................................] 0%, ETC: ---
[==================================================] 100%, completed in 0s
## Thu Feb 7 12:17:45 2019 Begin (eigenvalues and eigenvectors)
## Thu Feb 7 12:17:45 2019 Done.
## [1] 6.90 2.19 1.38 1.19 1.09 0.99
## [1] 11.66
## [1] 17.35
## [1] 32.17
## sample.id EV1 EV2
## 1 maiz_3 -0.05604816 0.11863611
## 2 maiz_68 -0.09949387 -0.05498846
## 3 maiz_91 -0.01601032 0.24887019
## 4 maiz_39 0.10733451 -0.06104180
## 5 maiz_12 -0.01527339 0.02222495
## 6 maiz_41 -0.02034604 0.03650302
Repite el PCA coloreando los maíces por Categoría de Altitud.
# obtner info Categ.Altitud
pop_code <- as.vector(fullmat$Categ.Altitud)
# hacer pop_codes raza coincidan con samples
tab <- data.frame(sample.id = pca$sample.id,
pop = factor(pop_code)[match(pca$sample.id, sample.id)],
EV1 = pca$eigenvect[,1], # the first eigenvector
EV2 = pca$eigenvect[,2], # the second eigenvector
stringsAsFactors = FALSE)
head(tab)
## sample.id pop EV1 EV2
## 1 maiz_3 mid -0.05604816 0.11863611
## 2 maiz_68 high -0.09949387 -0.05498846
## 3 maiz_91 mid -0.01601032 0.24887019
## 4 maiz_39 low 0.10733451 -0.06104180
## 5 maiz_12 mid -0.01527339 0.02222495
## 6 maiz_41 mid -0.02034604 0.03650302
# Plot
ggplot(data = tab, aes(x=EV2, y=EV1, colour = pop)) + geom_point() +
ylab(paste0("eigenvector 1 explaining ", round(pc.percent, 2)[1], "%")) +
xlab(paste0("eigenvector 2 explaining ", round(pc.percent, 2)[2], "%"))
Ejercicio: repite el plot anterior pero coloreando en un gradiente de colores (“green” para bajo y “brown” para alto). Pista para el gradiente de color en este link. La leyenda debe llevar debe decir “Altitude (masl)”.
Debe verse así:
¿Necesitas otra pista? considera que ggplot2 solo puede hacer gradietes de variables numéricas.