import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from RAISING.hp_tune_select import *
We will work with the new simulations generated under demographic model 1 (D1), discussed in the paper. After QC and recoding we have a genotype count matrix (*.012 files). There are 1250 individuals from 25 populations evolving under a linear environmental gradient. Here we work with the first replicate VCF file at generation 200 under D1.
geno_df = pd.read_csv("spatial_2D_sim_5x5pop_mut_msprime_1_200_QC.012", sep="\t", header=None)
geno_df = geno_df.iloc[:, 1:]  # drop the first column, which holds the sample index
SLiM writes positions to VCF files using a 1-based index, while the actual positions in the simulation setup are 0-based. We therefore shift the positions by one so that they align with, and can be compared against, the true loci under selection. Keep this in mind if you are working with the simulation VCF files from this project.
pos_id = pd.read_csv("spatial_2D_sim_5x5pop_mut_msprime_1_200_QC.012.pos", sep="\t", header=None)
pos_id.iloc[:, 1] = pos_id.iloc[:, 1] - 1  # shift 1-based VCF positions to 0-based
geno_df.columns = list(pos_id.iloc[:, 1])  # label each locus by its 0-based position
geno_df
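As a quick sanity check, we can confirm that the matrix has one row per individual and uniquely labelled loci (the locus count depends on the QC step, so only the row count is known in advance):
print(geno_df.shape)  # expect 1250 rows, one per individual
assert geno_df.columns.is_unique  # each locus position should appear only once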
Next we load the environment file, which contains both a linear and a quadratic environmental variable. We work with the linear gradient, which is the selective pressure in the loaded simulated data.
df = pd.read_csv("lin_quad_envdata.txt",sep = "\t")
y_data = df.iloc[:, [0]]  # first column holds the linear environmental gradient
print(y_data.head())
We perform hyperparameter tuning using Bayesian optimization over a user-specified search space. The configuration below is the content of hyperparameter_config.json:
{
  "layers": {"min_value": 1, "max_value": 3},
  "units": {"min_value": 100, "max_value": 300},
  "learning_rate": {"min_value": 1e-4, "max_value": 1e-2, "sampling": "log"},
  "activation": {"values": ["relu"]},
  "l1_regularizer": {"min_value": 1e-3, "max_value": 1e-2, "sampling": "log"},
  "l2_regularizer": {"min_value": 0, "max_value": 0, "sampling": "log"},
  "dropout": {"min_value": 0.05, "max_value": 0.5, "sampling": "linear"},
  "optimizer": {"values": ["adam"]},
  "weight_initializer": {"values": ["he_normal"]}
}
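If you prefer to build this configuration programmatically rather than editing the JSON by hand, a minimal sketch writing the same dictionary to the hyperparameter_config.json file consumed below would be:
import json

hp_space = {
    "layers": {"min_value": 1, "max_value": 3},
    "units": {"min_value": 100, "max_value": 300},
    "learning_rate": {"min_value": 1e-4, "max_value": 1e-2, "sampling": "log"},
    "activation": {"values": ["relu"]},
    "l1_regularizer": {"min_value": 1e-3, "max_value": 1e-2, "sampling": "log"},
    "l2_regularizer": {"min_value": 0, "max_value": 0, "sampling": "log"},
    "dropout": {"min_value": 0.05, "max_value": 0.5, "sampling": "linear"},
    "optimizer": {"values": ["adam"]},
    "weight_initializer": {"values": ["he_normal"]},
}
with open("hyperparameter_config.json", "w") as fh:
    json.dump(hp_space, fh, indent=2)  # write the search space for hp_optimization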
model = hp_optimization(input_data=geno_df, output_data=y_data, objective_fun="val_loss",
                        output_class="continuous", Standardize=False,
                        config_file="hyperparameter_config.json", cross_validation=False,
                        max_trials=50, min_delta=0.05, patience=5)
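The exact object returned by hp_optimization depends on the RAISING version, so a quick, assumption-free way to inspect what came back:
print(type(model))  # if this is a compiled Keras model, model.summary() prints the tuned architecture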
def minmaxscale(vec):
    """Rescale a vector linearly to the [0, 1] range."""
    return (vec - min(vec)) / (max(vec) - min(vec))
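A tiny usage check on a toy vector (reusing the numpy import from the top of this notebook):
demo = np.array([2.0, 4.0, 6.0])
print(minmaxscale(demo))  # -> [0.  0.5 1. ]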
GenFeat_df = feature_importance(input_data=geno_df, output_data=y_data,
                                feature_set=geno_df.columns.to_list(),
                                Standardize=False, iteration=5,
                                feature_method="DeepFeatImp", patience=5, epochs=100)
GenFeat_df["scaled_feat"] = minmaxscale(vec=GenFeat_df.iloc[:, 1])  # scale importance scores to [0, 1]
Finally, we load the true positions and effect sizes of the loci under selection so we can compare them against the inferred importance scores.
actual_pos = pd.read_csv("QT1_position_1.txt", header=None)
actual_pos = actual_pos.T  # transpose so each row is one selected locus
actual_pos.columns = ["POS", "effect_size"]
actual_pos["POS"] = actual_pos["POS"].astype(int)
plt.figure(figsize=(10, 6))
plt.plot(GenFeat_df['features'], GenFeat_df['scaled_feat'], marker='o', linestyle='-', color='b', alpha=0.4)
highlight_df = GenFeat_df[GenFeat_df['features'].isin(actual_pos['POS'])]  # loci that are truly under selection
plt.scatter(highlight_df['features'], highlight_df['scaled_feat'], color='red', zorder=5, label='selected loci')
for idx, row in highlight_df.iterrows():
    # annotate each selected locus with its true effect size
    effect_size = actual_pos.loc[actual_pos['POS'] == row['features'], 'effect_size'].values[0]
    plt.text(row['features'], row['scaled_feat'], f'{effect_size:.3f}', fontsize=12, color='black', ha='center', zorder=10)
plt.xlabel('Genetic Loci')
plt.ylabel('Feature Importance')
plt.title('Feature importance of genetic variants')
plt.legend()
plt.grid(True)
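# Optional: save the figure to disk; savefig must come before plt.show(), which
# clears the active figure in non-interactive backends. The filename is just an example.
plt.savefig("feature_importance_D1_gen200.png", dpi=300, bbox_inches="tight")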
plt.show()