toxicodb

ToxicoDB - A Toxicogenomics Brick

The toxicodb brick provides data from toxicodb.ca. In this post, we’ll use BioBricks.ai to look up gene and compound interaction data. To start, install biobricks.ai and then toxicodb:

import biobricks as bb, pyspark, subprocess, pandas as pd
import pyspark, pyspark.sql, pyspark.sql.functions as F, pyspark.sql.types as T

subprocess.run("biobricks install toxicodb", shell=True)
spark = pyspark.sql.SparkSession.builder.config("spark.driver.memory","4g").getOrCreate()
toxicodb = bb.assets('toxicodb')
tbls = [{"table": table, "count": spark.read.parquet(path).count()} for table, path in toxicodb.__dict__.items()]
pd.DataFrame(tbls).sort_values("count", ascending=False)
table count
2 TGGATEsHuman_TGGATEsHuman_parquet 47494698
4 TGGATEsRat_TGGATEsRat_parquet 39813228
0 DrugMatrixRat_DrugMatrixRat_parquet 11338749
5 EMEXP2458_EMEXP2458_parquet 1395730
1 genes_parquet 32092
3 compounds_parquet 234
rawhmn = spark.read.parquet(toxicodb.TGGATEsHuman_TGGATEsHuman_parquet)
exphmn = rawhmn\
  .filter(rawhmn.gene_symbol != '')\
  .groupBy("compound_name","gene_symbol","dose").agg(F.mean('expression').alias('exp'))\
  .orderBy("gene_symbol","dose")
exphmn.limit(5).toPandas()
                                                                                
compound_name gene_symbol dose exp
0 Hexachlorobenzene A1BG-AS1 0 5.070635
1 Flutamide A1BG-AS1 0 5.128496
2 Isoniazid A1BG-AS1 0 5.227362
3 Tacrine A1BG-AS1 0 5.197305
4 Buthionine sulfoximine A1BG-AS1 0 5.025494
window_spec = pyspark.sql.Window.partitionBy("compound_name", "gene_symbol").orderBy("dose")
udf_change = F.udf(lambda arr: arr[-1] - arr[0], T.FloatType())
hmn = exphmn\
  .withColumn("expressions", F.collect_list("exp").over(window_spec))\
  .groupBy("compound_name", "gene_symbol").agg(F.max("expressions").alias("expressions"))\
  .withColumn("expr_change", udf_change(F.col("expressions")))\
  .select("compound_name", "gene_symbol", "expr_change").toPandas()
hmn.head()
                                                                                
compound_name gene_symbol expr_change
0 1-Naphthyl isothiocyanate A1CF -0.007111
1 1-Naphthyl isothiocyanate A4GALT 0.182894
2 1-Naphthyl isothiocyanate A4GNT -0.148850
3 1-Naphthyl isothiocyanate AAR2 0.174779
4 1-Naphthyl isothiocyanate ABCA11P 0.263340
import plotly.express as px
from scipy.cluster.hierarchy import linkage, leaves_list
from scipy.spatial.distance import pdist

# Assuming 'hmn' is pre-defined with 'compound_name', 'gene_symbol', 'expr_change'

# Calculate high variance genes and filter the dataframe
highvar_genes = hmn.groupby('gene_symbol')['expr_change'].var().nlargest(200)
hvdf = hmn[hmn['gene_symbol'].isin(highvar_genes.index)]

# Pivot the DataFrame
pivot_df = hvdf.pivot(index="compound_name", columns="gene_symbol", values="expr_change").fillna(0)

# Compute clusters
row_clusters = linkage(pdist(pivot_df, 'euclidean'), method='average')
col_clusters = linkage(pdist(pivot_df.T, 'euclidean'), method='average')

# Determine the order of rows and columns based on the clusters
row_order = leaves_list(row_clusters)
col_order = leaves_list(col_clusters)

# Reorder the DataFrame according to the clustering
clustered_df = pivot_df.iloc[row_order, col_order]

# Create the heatmap using Plotly Express
lbls = dict(x="", y="", color="Expression Change")
title = "TGGATES Gene Expression Change Over Dose in Human Cells"
fig = px.imshow(clustered_df, labels=lbls, x=clustered_df.columns, y=clustered_df.index, aspect="auto", title=title)

# Update the layout for transparent background and white text
fig.update_layout(
    xaxis={'side': 'bottom', 'title_standoff': 10, 'color': 'white'},
    yaxis={'title_standoff': 10, 'color': 'white'},
    title={'text': title, 'x': 0.5, 'xanchor': 'center', 'font': {'color': 'white'}},
    plot_bgcolor='rgba(0,0,0,0)',  # Transparent background
    paper_bgcolor='rgba(0,0,0,0)',  # Transparent surrounding
    margin=dict(l=20, r=20, t=50, b=20),
    font=dict(color='white', size=12)  # Set font color to white
)

# Show the figure
fig.show()
Unable to display output for mime type(s): application/vnd.plotly.v1+json