I did this to get FST distances:
Code:
$ printf %s\\n Mansi Finnish Nganasan Selkup Karelian Udmurt Mordovian>pops
$ R -e 'library(admixtools);fst=fst("g/v44.3_HO_public/v44.3_HO_public",pop1=readLines("pops"));write.csv(fst,"fst",quote=F)'
ℹ Reading allele frequencies from packedancestrymap files...
ℹ v44.3_HO_public.geno has 13197 samples and 597573 SNPs
ℹ Calculating allele frequencies from 19 samples in 4 populations
ℹ Expected size of allele frequency data: 86 MB
597k SNPs read...
✔ 597573 SNPs read in total
! 593124 SNPs remain after filtering. 414780 are polymorphic.
ℹ Allele frequency matrix for 593124 SNPs and 4 populations is 62 MB
ℹ Computing pairwise f2 for all SNPs and population pairs requires 493 MB RAM without splitting
ℹ Computing without splitting since 493 < 8000 (maxmem)...
ℹ Data written to f2/
ℹ Reading precomputed data for 4 populations...
ℹ Reading f2 data for pair 10 out of 10...
Warning message:
In read_f2(dir, pops, pops2, afprod = afprod, fst = fst, remove_na = remove_na, :
Discarding 1 block(s) due to missing values!
Discarded block(s): 535
>
>
$ cat fst
,pop1,pop2,est,se,z,p
1,Finnish,Karelian,0.00129340940098996,0.000385024618276676,3.35929013261311,0.000781429796323533
2,Finnish,Mordovian,0.00543917401762932,0.0003253493038913,16.7179519137578,9.6995724807741e-63
3,Finnish,Nganasan,0.119054350470445,0.00113575044283692,104.824392736371,0
4,Finnish,Selkup,0.0601437871347565,0.000773515188052884,77.753854175963,0
5,Finnish,Udmurt,0.0187032075983067,0.000585527652038594,31.9424839001009,6.87035668325847e-224
6,Karelian,Mordovian,0.00590771927078587,0.000239357005605168,24.6816225656289,1.68484078541802e-134
7,Karelian,Udmurt,0.019523384287915,0.000473936956593016,41.1940533784545,0
8,Mansi,Finnish,0.0402190424166203,0.000841540850181047,47.7921450966614,0
9,Mansi,Karelian,0.0399509729801598,0.000728931940431647,54.8075489139662,0
10,Mansi,Mordovian,0.0383778238793512,0.000668216221602333,57.4332418739013,0
11,Mansi,Nganasan,0.0602924170396429,0.000867560662779261,69.4964855212476,0
12,Mansi,Selkup,0.0223050689999093,0.000513096315174833,43.4715049401769,0
13,Mansi,Udmurt,0.0240652073778455,0.000663882926915772,36.2491734644333,1.02394799408991e-287
14,Nganasan,Karelian,0.118602793551424,0.00108015804818372,109.801333009418,0
15,Nganasan,Mordovian,0.11745770899229,0.00099405063691636,118.160689838352,0
16,Nganasan,Selkup,0.0504386703379596,0.000674035384985417,74.8308938395731,0
17,Nganasan,Udmurt,0.0911528579608077,0.000973329536820952,93.650561821567,0
18,Selkup,Karelian,0.0595410382563537,0.00071504905760348,83.268466160781,0
19,Selkup,Mordovian,0.0579452329275561,0.000631133692337951,91.8113446184526,0
20,Selkup,Udmurt,0.0409818006630253,0.000612173976633259,66.9446958337084,0
21,Udmurt,Mordovian,0.0170968626475659,0.000406949757311595,42.0122197897642,0
There's probably an easier way to do this in R, but this converts the FST pairs into a table:
Code:
$ awk -F, 'NR>1{print$3","$2","$4;print$2","$3","$4}' fst|awk -F, '{print$1","$1","}1'|sort -u>/tmp/a
$ cut -d, -f3 /tmp/a|awk '{printf"%.6f"(NR%n?",":"\n"),$0}' n=$(awk 'END{print NR^.5}' /tmp/a) -|paste -d, <(cut -d, -f1 /tmp/a|sort -u) -|cat <(cut -d, -f1 /tmp/a|sort -u|paste -sd, -|sed s/^/,/) ->/tmp/b
$ cat /tmp/b
,Finnish,Karelian,Mansi,Mordovian,Nganasan,Selkup,Udmurt
Finnish,0.000000,0.001293,0.040219,0.005439,0.119054,0.060144,0.018703
Karelian,0.001293,0.000000,0.039951,0.005908,0.118603,0.059541,0.019523
Mansi,0.040219,0.039951,0.000000,0.038378,0.060292,0.022305,0.024065
Mordovian,0.005439,0.005908,0.038378,0.000000,0.117458,0.057945,0.017097
Nganasan,0.119054,0.118603,0.060292,0.117458,0.000000,0.050439,0.091153
Selkup,0.060144,0.059541,0.022305,0.057945,0.050439,0.000000,0.040982
Udmurt,0.018703,0.019523,0.024065,0.017097,0.091153,0.040982,0.000000
And this creates a heatmap of the table:
Code:
R -e 'install.packages(c("pheatmap","colorspace"),repos="https://cloud.r-project.org")'
R -e 'library(pheatmap)
library(colorspace)
t<-read.csv("/tmp/b",header=T,row.names=1,check.names=F)
t[t==0]=NA
pheatmap(
1e4*t,
filename="/tmp/a.png",
legend=F,
clustering_callback=function(...){hclust(as.dist(t))},
cellwidth=18,
cellheight=12,
border_color=NA,
display_numbers=T,
number_format="%.0f",
number_color="black",
fontsize_number=6,
colorRampPalette(hex(HSV(c(210,180,150,120,90,60,30,0),.5,1)))(256)
)'
At first I got an error that there were too many missing blocks, so I tried adding a `maxmiss=Inf` parameter:
Code:
R -e 'library("admixtools");extract_f2(pref="g/v44.3_HO_public/v44.3_HO_public",pops=c("Finnish","Mansi","Mari.SG","Estonian.DG"),outdir="f2",maxmiss=Inf);f2=f2_from_precomp("f2");fst=fst(f2);write.csv(fst,"fst",quote=F)'
However it gave me nonsensical results where the distance between Finns and Maris was an order of magnitude bigger than the distance between Finns and Mansi:
Code:
,pop1,pop2,est,se,z,p
1,Estonian.DG,Finnish,0.000904946578981571,0.000400460532337746,2.25976471064156,0.0238358576870844
2,Estonian.DG,Mansi,0.015818211648642,0.00055465784565136,28.5188639675077,6.83699132061968e-179
3,Estonian.DG,Mari.SG,0.174033745411937,0.00101794728786578,170.96538051279,0
4,Finnish,Mansi,0.0139136259691746,0.000307938735432869,45.1830977016134,0
5,Finnish,Mari.SG,0.17331351350787,0.000760056331041605,228.027195392683,0
6,Mansi,Mari.SG,0.17490109671494,0.000788482478735399,221.819890018931,0
Bookmarks