Yeah I figured it out already. I did something like this to make a global PCA of modern individuals in the Reich dataset:
Code:
wget reichdata.hms.harvard.edu/pub/datasets/amh_repo/curated_releases/V44/V44.3/SHARE/public.dir/v44.3_1240K_public.tar
tar -xf v44.3_1240K_public.tar
f=v44.3_1240K_public;convertf -p <(printf %s\\n genotypename:\ $f.geno snpname:\ $f.snp indivname:\ $f.ind outputformat:\ PACKEDPED genotypeoutname:\ $f.bed snpoutname:\ $f.bim indivoutname:\ $f.fam)
sed 1d v44.3_1240K_public.anno|grep -v 1KGPhase|awk -F\\t '$9=="Modern"{print$2,$13}'|grep -Ev '_dup|Ignore_|\.REF|_o'|sed -E 's/\.(SDG|DG|SG)$//'>picks0
awk 'NR==FNR{a[$1];next}$2 in a' picks0 v44.3_1240K_public.fam>picks
plink --bfile v44.3_1240K_public --keep picks --allow-no-sex --make-bed --out picks
plink --bfile picks --pca --geno .001 --allow-no-sex --out picks
paste -d' ' <(cut -d' ' -f2 picks0) <(cut -d' ' -f2- picks.eigenvec)>picks.eigenvec.2
When I didn't add `--geno .001`, it fked up the clustering of some populations at first, so that for example one South Asian population clustered together with Africans.
I couldn't get convertf to compile, but I downloaded a Mac binary from here: https://github.com/chrchang/eigensoft. The Mac binaries for plink from Harvard's website didn't work, but there were working binaries by another maintainer here: https://www.cog-genomics.org/plink/1.9/. You can download the v44.3_1240K_public.tar file manually from here: https://reichdata.hms.harvard.edu/pu...ses/index.html.
I then made this in R:
Code:
libarary(tidyverse)
library(colorspace)
f="picks"
t=read.table(paste0(f,".eigenvec.2"),sep=" ")
eig=as.double(readLines(paste0(f,".eigenval")))
# t=cbind(t[,c(1,2)],t(t(t[,-c(1,2)])*sqrt(eig))) # I think this corresponds to scaling in G25
pct=paste0("PC",seq(length(eig))," (",sprintf("%.1f",100*eig/sum(eig)),"%)")
ave=aggregate(t[,-c(1,2)],list(t[,1]),mean)
names(ave)=c("pop",paste0("PC",seq(ncol(ave)-1)))
k=cutree(hclust(dist(ave[,-1]),method="ward.D2"),k=12)
write.csv(k,"/tmp/k",quote=F)
ave$k=k
ggplot(ave,aes(x=PC1,y=PC2,label=pop))+
geom_point(aes(color=as.factor(k)),size=.5)+
geom_polygon(data=ave%>%group_by(k)%>%slice(chull(PC1,PC2)),alpha=.2,aes(color=as.factor(k),fill=as.factor(k)),size=.3)+
geom_text(aes(label=pop,color=as.factor(k)),size=2,vjust=-.7)+
theme(
aspect.ratio=3/4,
axis.text=element_text(color="black",size=7),
axis.ticks.length=unit(0,"pt"),
axis.ticks.x=element_blank(),
axis.ticks.y=element_blank(),
axis.title=element_text(color="black",size=10),
legend.position="none",
panel.background=element_rect(fill="white"),
panel.grid.major=element_line(color="gray75",size=.2)
)+
scale_x_continuous(breaks=seq(-2,2,.1),expand=expansion(mult=.07))+
scale_y_continuous(breaks=seq(-2,2,.1),expand=expansion(mult=.06))+
labs(x=pct[1],y=pct[2])+
scale_color_discrete_qualitative(palette="Set 2",c=80,l=40)
ggsave("output.png")
system("/usr/local/bin/mogrify -trim -bordercolor white -border 20x20 output.png")
However there's something wrong with the distances in my PCA. For example Finns have about 5 times bigger distance to Khomani_San than to Yoruba:
Code:
$ tav(){ awk '{n[$1]++;for(i=2;i<=NF;i++){a[$1][i]+=$i}}END{for(i in a){o=i;for(j=2;j<=NF;j++)o=o FS sprintf("%f",a[i][j]/n[i]);print o}}' "FS=${1-$'\t'}";}
$ dist(){ awk -F, 'NR==FNR{for(i=2;i<=NF;i++)a[i]=$i;next}$1{s=0;for(i=2;i<=NF;i++)s+=($i-a[i])^2;print s^.5,$1}' "$2" "$1"|sort -n|awk '{printf"%.3f %s\n",$1,$2}'|sed s,^0,,;}
$ paste -d' ' <(cut -d' ' -f2 maailma0) <(cut -d' ' -f3- maailma.eigenvec)|tav ' '|tr ' ' ,>ave;dist ave <(grep Finnish ave)|tail -n16
.130 Karitiana
.130 Mende
.135 Gambian
.136 Esan
.155 Yoruba
.174 Papuan
.189 Mandenka
.206 BantuKenya
.216 Biaka
.242 BantuSA
.268 Mbuti
.336 Ju_hoan_North
.492 BantuHerero
.492 BantuSA_Herero
.497 BantuTswana
.705 Khomani_San
Am I supposed to apply some further quality control or filtering? I tried to include only samples with few missing SNPs: `awk -F\\t '$21>9e5' v44.3_1240K_public.anno`. I also tried increasing the value of the `--geno` option and I tried adding an option like `--max-maf .3`. None of it helped however.
I also tried multiplying the columns of the table with the square roots of the eigenvalues but it didn't help:
Code:
f="picks"
t=read.table(paste0(f,".eigenvec.2"),sep=" ")
eig=as.double(readLines(paste0(f,".eigenval")))
t2=(cbind(t[,c(1,2)],t(t(t[,-c(1,2)])*sqrt(eig))))
ave=aggregate(t[,-c(1,2)],list(t[,1]),mean)
ave2=aggregate(t2[,-c(1,2)],list(t2[,1]),mean)
ind=cbind(paste0(t[,1],":",t[,2]),t[,-c(1,2)])
ind2=cbind(paste0(t2[,1],":",t2[,2]),t2[,-c(1,2)])
write.table(ind,paste0(f,".ind"),quote=F,sep=",",col.names=F,row.names=F)
write.table(ind2,paste0(f,".indscaled"),quote=F,sep=",",col.names=F,row.names=F)
write.table(ave,paste0(f,".ave"),quote=F,sep=",",col.names=F,row.names=F)
write.table(ave2,paste0(f,".avescaled"),quote=F,sep=",",col.names=F,row.names=F)
Bookmarks