AndreiDNA
03-05-2025, 11:16 PM
Code:
import pandas as pd
import re
# Load GWAS summary statistics file
def load_gwas(file):
gwas_df = pd.read_csv(file, sep='\t') # Adjust separator if needed
# Split the "SNP-RISK ALLELE" column into "SNP" and "Risk_Allele"
gwas_df[['SNP', 'Risk_Allele']] = gwas_df['STRONGEST SNP-RISK ALLELE'].str.split('-', expand=True)
# Keep only relevant columns
gwas_df = gwas_df[['SNP', 'Risk_Allele', 'OR or BETA']].rename(columns={'OR or BETA': 'Beta'})
return gwas_df.set_index('SNP')
def load_genotypes(file):
geno_df = pd.read_csv(file, sep=',', comment='#', skip_blank_lines=True) # Skip comment lines
# Print column names for debugging
print("Genotype Column Names:", geno_df.columns.tolist())
# Ensure 'RSID' is recognized properly
if 'RSID' not in geno_df.columns:
raise KeyError("The column 'RSID' is missing. Check if the file format is correct.")
return geno_df.set_index('RSID')
# Calculate polygenic risk scores
def calculate_beta(geno_df, gwas_df):
common_snps = geno_df.index.intersection(gwas_df.index)
gwas_filtered = gwas_df.loc[common_snps]
geno_filtered = geno_df.loc[common_snps]
# Convert genotype data to counts of the risk allele
def count_risk_alleles(row):
snp = row.name # SNP ID
# Ensure we extract a single value for Risk_Allele
risk_allele = gwas_filtered.loc[snp, 'Risk_Allele']
if isinstance(risk_allele, pd.Series):
risk_allele = risk_allele.iloc[0] # Take the first value if there are duplicates
# If the risk allele is empty or contains special characters, return 0 count
if not risk_allele or not isinstance(risk_allele, str):
return 0
# Safely count occurrences of risk allele in genotype
try:
return row.str.count(re.escape(risk_allele)) # Use re.escape to safely handle special characters
except re.error:
print(f"Error with SNP {snp} and risk allele {risk_allele}")
return 0
# Apply the count_risk_alleles function to each row of genotype data
risk_allele_counts = geno_filtered.apply(count_risk_alleles, axis=1)
# Compute weighted sum of risk allele counts by beta
beta_scores = (risk_allele_counts.multiply(gwas_filtered['Beta'], axis=0)).sum()
return beta_scores
if __name__ == "__main__":
gwas_file = input("Enter the GWAS dataset name: ")
geno_file = input("Enter the raw DNA file name: ")
gwas_df = load_gwas(gwas_file)
geno_df = load_genotypes(geno_file)
beta_scores = calculate_beta(geno_df, gwas_df)
print(beta_scores)
Go to https://www.ebi.ac.uk/gwas/
Search up an illness
Click on the Trait
Find "Download Associations" button in bottom right of the page
Then
Go to google colab
Create a tab and add the code to it
Upload your raw DNA file (Has to be in Myheritage format) to the work directory
Upload the GWAS associations to same directory
Run the code cell
Input the name of the GWAS associations .tsv doc
Input the name of your Myheritage DNA file
Result will come in a couple seconds, depending on how large the GWAS dataset is
Here is a sample result (My mom, trait is Bipolar disorder type 1):
137835
You can compare your genetic predispositions with your siblings, other people, whoever. Higher number means higher predisposition.
It won't work with super large datasets so try to limit your dataset size by choosing specific studies when downloading the associations
import pandas as pd
import re
# Load GWAS summary statistics file
def load_gwas(file):
gwas_df = pd.read_csv(file, sep='\t') # Adjust separator if needed
# Split the "SNP-RISK ALLELE" column into "SNP" and "Risk_Allele"
gwas_df[['SNP', 'Risk_Allele']] = gwas_df['STRONGEST SNP-RISK ALLELE'].str.split('-', expand=True)
# Keep only relevant columns
gwas_df = gwas_df[['SNP', 'Risk_Allele', 'OR or BETA']].rename(columns={'OR or BETA': 'Beta'})
return gwas_df.set_index('SNP')
def load_genotypes(file):
geno_df = pd.read_csv(file, sep=',', comment='#', skip_blank_lines=True) # Skip comment lines
# Print column names for debugging
print("Genotype Column Names:", geno_df.columns.tolist())
# Ensure 'RSID' is recognized properly
if 'RSID' not in geno_df.columns:
raise KeyError("The column 'RSID' is missing. Check if the file format is correct.")
return geno_df.set_index('RSID')
# Calculate polygenic risk scores
def calculate_beta(geno_df, gwas_df):
common_snps = geno_df.index.intersection(gwas_df.index)
gwas_filtered = gwas_df.loc[common_snps]
geno_filtered = geno_df.loc[common_snps]
# Convert genotype data to counts of the risk allele
def count_risk_alleles(row):
snp = row.name # SNP ID
# Ensure we extract a single value for Risk_Allele
risk_allele = gwas_filtered.loc[snp, 'Risk_Allele']
if isinstance(risk_allele, pd.Series):
risk_allele = risk_allele.iloc[0] # Take the first value if there are duplicates
# If the risk allele is empty or contains special characters, return 0 count
if not risk_allele or not isinstance(risk_allele, str):
return 0
# Safely count occurrences of risk allele in genotype
try:
return row.str.count(re.escape(risk_allele)) # Use re.escape to safely handle special characters
except re.error:
print(f"Error with SNP {snp} and risk allele {risk_allele}")
return 0
# Apply the count_risk_alleles function to each row of genotype data
risk_allele_counts = geno_filtered.apply(count_risk_alleles, axis=1)
# Compute weighted sum of risk allele counts by beta
beta_scores = (risk_allele_counts.multiply(gwas_filtered['Beta'], axis=0)).sum()
return beta_scores
if __name__ == "__main__":
gwas_file = input("Enter the GWAS dataset name: ")
geno_file = input("Enter the raw DNA file name: ")
gwas_df = load_gwas(gwas_file)
geno_df = load_genotypes(geno_file)
beta_scores = calculate_beta(geno_df, gwas_df)
print(beta_scores)
Go to https://www.ebi.ac.uk/gwas/
Search up an illness
Click on the Trait
Find "Download Associations" button in bottom right of the page
Then
Go to google colab
Create a tab and add the code to it
Upload your raw DNA file (Has to be in Myheritage format) to the work directory
Upload the GWAS associations to same directory
Run the code cell
Input the name of the GWAS associations .tsv doc
Input the name of your Myheritage DNA file
Result will come in a couple seconds, depending on how large the GWAS dataset is
Here is a sample result (My mom, trait is Bipolar disorder type 1):
137835
You can compare your genetic predispositions with your siblings, other people, whoever. Higher number means higher predisposition.
It won't work with super large datasets so try to limit your dataset size by choosing specific studies when downloading the associations