Log in

View Full Version : Check your health predispositions with GWAS for free



AndreiDNA
03-05-2025, 11:16 PM
Code:


import pandas as pd
import re

# Load GWAS summary statistics file
def load_gwas(file):
gwas_df = pd.read_csv(file, sep='\t') # Adjust separator if needed

# Split the "SNP-RISK ALLELE" column into "SNP" and "Risk_Allele"
gwas_df[['SNP', 'Risk_Allele']] = gwas_df['STRONGEST SNP-RISK ALLELE'].str.split('-', expand=True)

# Keep only relevant columns
gwas_df = gwas_df[['SNP', 'Risk_Allele', 'OR or BETA']].rename(columns={'OR or BETA': 'Beta'})

return gwas_df.set_index('SNP')

def load_genotypes(file):
geno_df = pd.read_csv(file, sep=',', comment='#', skip_blank_lines=True) # Skip comment lines

# Print column names for debugging
print("Genotype Column Names:", geno_df.columns.tolist())

# Ensure 'RSID' is recognized properly
if 'RSID' not in geno_df.columns:
raise KeyError("The column 'RSID' is missing. Check if the file format is correct.")

return geno_df.set_index('RSID')

# Calculate polygenic risk scores
def calculate_beta(geno_df, gwas_df):
common_snps = geno_df.index.intersection(gwas_df.index)

gwas_filtered = gwas_df.loc[common_snps]
geno_filtered = geno_df.loc[common_snps]

# Convert genotype data to counts of the risk allele
def count_risk_alleles(row):
snp = row.name # SNP ID

# Ensure we extract a single value for Risk_Allele
risk_allele = gwas_filtered.loc[snp, 'Risk_Allele']
if isinstance(risk_allele, pd.Series):
risk_allele = risk_allele.iloc[0] # Take the first value if there are duplicates

# If the risk allele is empty or contains special characters, return 0 count
if not risk_allele or not isinstance(risk_allele, str):
return 0

# Safely count occurrences of risk allele in genotype
try:
return row.str.count(re.escape(risk_allele)) # Use re.escape to safely handle special characters
except re.error:
print(f"Error with SNP {snp} and risk allele {risk_allele}")
return 0

# Apply the count_risk_alleles function to each row of genotype data
risk_allele_counts = geno_filtered.apply(count_risk_alleles, axis=1)

# Compute weighted sum of risk allele counts by beta
beta_scores = (risk_allele_counts.multiply(gwas_filtered['Beta'], axis=0)).sum()
return beta_scores



if __name__ == "__main__":
gwas_file = input("Enter the GWAS dataset name: ")
geno_file = input("Enter the raw DNA file name: ")

gwas_df = load_gwas(gwas_file)
geno_df = load_genotypes(geno_file)

beta_scores = calculate_beta(geno_df, gwas_df)
print(beta_scores)


Go to https://www.ebi.ac.uk/gwas/
Search up an illness
Click on the Trait
Find "Download Associations" button in bottom right of the page

Then

Go to google colab
Create a tab and add the code to it

Upload your raw DNA file (Has to be in Myheritage format) to the work directory
Upload the GWAS associations to same directory
Run the code cell
Input the name of the GWAS associations .tsv doc
Input the name of your Myheritage DNA file
Result will come in a couple seconds, depending on how large the GWAS dataset is

Here is a sample result (My mom, trait is Bipolar disorder type 1):
137835
You can compare your genetic predispositions with your siblings, other people, whoever. Higher number means higher predisposition.
It won't work with super large datasets so try to limit your dataset size by choosing specific studies when downloading the associations

AndreiDNA
03-06-2025, 03:12 AM
import pandas as pd
import re

# Load GWAS summary statistics file
def load_gwas(file):
gwas_df = pd.read_csv(file, sep='\t', dtype=str) # Ensure all data is read as string

# Handle cases where "STRONGEST SNP-RISK ALLELE" may be missing or malformed
if 'STRONGEST SNP-RISK ALLELE' in gwas_df.columns:
gwas_df[['SNP', 'Risk_Allele']] = gwas_df['STRONGEST SNP-RISK ALLELE'].str.split('-', n=1, expand=True)
gwas_df['Risk_Allele'] = gwas_df['Risk_Allele'].fillna('') # Ensure Risk_Allele is never NaN
else:
print("Warning: 'STRONGEST SNP-RISK ALLELE' column missing in GWAS file")
return pd.DataFrame() # Return an empty DataFrame to prevent crashes

# Keep only relevant columns
if 'OR or BETA' in gwas_df.columns:
gwas_df = gwas_df[['SNP', 'Risk_Allele', 'OR or BETA']].rename(columns={'OR or BETA': 'Beta'})
else:
print("Warning: 'OR or BETA' column missing in GWAS file")
return pd.DataFrame()

# Convert Beta to numeric, skipping errors
gwas_df['Beta'] = pd.to_numeric(gwas_df['Beta'], errors='coerce')

return gwas_df.dropna(subset=['SNP']).set_index('SNP') # Drop rows where SNP is missing

# Load microarray genotype data (MyHeritage format is CSV, not TSV)
def load_genotypes(file):
geno_df = pd.read_csv(file, sep=',', comment='#', skip_blank_lines=True, dtype=str) # Read all columns as strings

# Ensure 'RSID' exists
if 'RSID' not in geno_df.columns:
raise KeyError("The column 'RSID' is missing. Check if the file format is correct.")

# Drop rows with missing RSID values
geno_df = geno_df.dropna(subset=['RSID'])

# Convert genotypes to uppercase for consistency
geno_df['RESULT'] = geno_df['RESULT'].str.upper()

return geno_df.set_index('RSID')

# Calculate polygenic risk scores
def calculate_beta(geno_df, gwas_df):
common_snps = geno_df.index.intersection(gwas_df.index)

if common_snps.empty:
print("No matching SNPs found between GWAS and genotype data.")
return 0.0 # Return 0 if no SNPs are shared

gwas_filtered = gwas_df.loc[common_snps]
geno_filtered = geno_df.loc[common_snps]

# Convert genotype data to counts of the risk allele
def count_risk_alleles(row):
snp = row.name # SNP ID
risk_allele = gwas_filtered.at[snp, 'Risk_Allele']

if not isinstance(risk_allele, str) or risk_allele == '':
return 0 # Skip problematic SNPs

try:
return row['RESULT'].count(risk_allele) # Count risk alleles
except Exception as e:
print(f"Skipping SNP {snp} due to error: {e}")
return 0

# Apply the count function
risk_allele_counts = geno_filtered.apply(count_risk_alleles, axis=1)

# Compute weighted sum of risk allele counts by beta
beta_multiplications = risk_allele_counts.multiply(gwas_filtered['Beta'], axis=0)
total_beta_sum = beta_multiplications.sum()
valid_multiplications = beta_multiplications.count()

# Normalize by number of valid beta multiplications
beta_score = total_beta_sum / valid_multiplications if valid_multiplications > 0 else 0.0
return beta_score

if __name__ == "__main__":
gwas_file = input("Enter the GWAS dataset name: ")
geno_file = input("Enter the raw DNA file name: ")

gwas_df = load_gwas(gwas_file)
if gwas_df.empty:
print("Error loading GWAS dataset. Exiting.")
else:
geno_df = load_genotypes(geno_file)
beta_scores = calculate_beta(geno_df, gwas_df)
print(f"Calculated Beta Score: {beta_scores}")


^ Updated code that works with all datasets and gives the score out as a multiplier or OR similar to my Trait Predictor