Below is the full Python notebook for the post "Working with CPDB in Python".

Setup

python

import biobricks as bb
import pyspark
import pyspark.sql.functions as F
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

cpdb = bb.assets('cpdb')
spark = pyspark.sql.SparkSession.builder.getOrCreate()

## build a table of chemicals, species, routes, and td50
ncintp = spark.read.parquet(cpdb.ncintp_parquet)
species = spark.read.parquet(cpdb.species_parquet)
route = spark.read.parquet(cpdb.route_parquet)
# the joins swap the 'species' and 'route' keys for their names (spname, rtename)
sdf = ncintp.join(species, 'species').select('chemcode','species','spname','route','td50')
sdf = sdf.join(route, 'route').select('chemcode','species','spname','rtename','td50')

## take the minimum td50 for each (chemical, species, route) combination
sdf = sdf.groupBy('chemcode','spname','rtename').agg(F.min('td50').alias('td50'))

## keep rows with td50 < 1000 whose rtename is gavage, diet or inhalation
sdf = sdf.filter(sdf['td50'] < 1000)
sdf = sdf.filter(sdf['rtename'].isin('gavage', 'diet', 'inhalation'))

## count entries for each species and route, largest group first
sdf.groupBy('spname','rtename').count().sort(F.col('count').desc()).show()

# pandas data frame with columns chemcode, spname, rtename and td50;
# the chart functions below read it as the global `df`
df = sdf.toPandas()

Now that we have a pandas dataframe we can build some plots.

TD50 CHART

python

def td50_chart():
    """Save a dark-themed histogram of all TD50 values in ``df``.

    Reads the ``td50`` column of the module-level pandas data frame ``df``
    and writes a 30-bin histogram to
    ``static/images/2024-03-12-cpdb/TD50_distribution.png``.
    Returns nothing; the figure is left open so a notebook can display it.
    """
    fig, ax = plt.subplots(figsize=(10, 6))

    # Histogram with all bars in a greenish color
    ax.hist(df['td50'], bins=30, color="#1b9e77", edgecolor='white')

    # Title and axis labels
    ax.set_title('TD50 Values Distribution', fontsize=20, color='white', fontweight='bold')
    ax.set_xlabel('TD50 Values', fontsize=14, color='white')
    ax.set_ylabel('Counts', fontsize=14, color='white')

    # Dark theme: same background for the axes and the surrounding figure
    ax.set_facecolor('#333333')
    fig.patch.set_facecolor('#333333')
    ax.tick_params(axis='x', colors='white')
    ax.tick_params(axis='y', colors='white')

    # Remove the frame
    for spine in ax.spines.values():
        spine.set_visible(False)

    fig.tight_layout()
    # Pass the figure's facecolor explicitly, as species_analysis and
    # route_analysis do, so the saved PNG keeps the dark background
    # regardless of matplotlib's savefig defaults.
    fig.savefig(
        'static/images/2024-03-12-cpdb/TD50_distribution.png',
        format='png',
        dpi=400,
        facecolor=fig.get_facecolor(),
        edgecolor='none',
    )

td50_chart()

TD50s for all chemicals with TD50 less than 1000 mg/kg/day in CPDB

Build a plot for mice and rats

python

def species_analysis():
    """Save one dark-themed TD50 histogram per species found in ``df``.

    Reads the module-level pandas data frame ``df`` (columns ``spname`` and
    ``td50``), draws one panel per distinct ``spname`` and writes the grid to
    ``static/images/2024-03-12-cpdb/TD50_distribution_mouse_rat_dark_bg.png``.
    Returns nothing.
    """
    g = sns.FacetGrid(df, col='spname', col_wrap=2, height=6, aspect=1.5, sharex=False, sharey=True)

    # density=True: each panel's histogram is normalised rather than showing raw counts
    g.map(plt.hist, 'td50', bins=30, color="#1b9e77", edgecolor='white', density=True)

    # Axis labels, tick labels and limits
    g.set_xlabels('TD50', color='white', fontsize=20)
    g.set_ylabels('Frequency', color='white', fontsize=20)
    g.set_xticklabels(fontsize=16, color='white')
    g.set_yticklabels(fontsize=16, color='white')
    g.set(xlim=(0, 1000))

    # Adjust subplot parameters to ensure the background is consistently dark
    g.fig.subplots_adjust(wspace=0.05)  # Reduce space between subplots
    g.fig.patch.set_facecolor('#333333')  # Set the figure background to dark

    # Title each panel with the facet value the grid itself assigned to it.
    # g.col_names is the grid's own ordering, so titles stay matched to their
    # panels; the explicit set_title also fixes size, colour and weight, which
    # makes a separate g.set_titles(...) call redundant.
    for ax, title in zip(g.axes.flat, g.col_names):
        ax.set_title(title, fontsize=40, color='white', fontweight='bold')

        # Dark panel without a frame
        ax.tick_params(colors='white', which='both')
        ax.set_facecolor('#333333')
        for spine in ax.spines.values():
            spine.set_visible(False)

        # Add a subtle white grid
        ax.grid(color='white', linestyle='-', linewidth=0.2, alpha=0.3)

    # rect leaves a small margin at the bottom and top of the figure
    g.fig.tight_layout(rect=[0, 0.03, 1, 0.95])
    g.fig.savefig('static/images/2024-03-12-cpdb/TD50_distribution_mouse_rat_dark_bg.png', format='png', facecolor=g.fig.get_facecolor(), edgecolor='none')

species_analysis()

TD50s for rats and mice in CPDB

Build a histogram for each route

python

def route_analysis():
    """Save and show one dark-themed TD50 histogram per administration route.

    Reads the module-level pandas data frame ``df`` (columns ``rtename`` and
    ``td50``), draws one panel per distinct ``rtename`` and writes the grid to
    ``static/images/2024-03-12-cpdb/TD50_distribution_route_dark_bg.png``
    before calling ``plt.show()``. Returns nothing.
    """
    g = sns.FacetGrid(df, col='rtename', col_wrap=3, height=6, aspect=1.5, sharex=False, sharey=True)

    # density=True: each panel's histogram is normalised rather than showing raw counts
    g.map(plt.hist, 'td50', bins=30, color="#1b9e77", edgecolor='white', density=True)

    # Axis labels, tick labels and limits
    g.set_xlabels('TD50', color='white', fontsize=20)
    g.set_ylabels('Frequency', color='white', fontsize=20)
    g.set_xticklabels(fontsize=16, color='white')
    g.set_yticklabels(fontsize=16, color='white')
    g.set(xlim=(0, 1000))

    # Adjust subplot parameters to ensure the background is consistently dark
    g.fig.subplots_adjust(wspace=0.05)  # Reduce space between subplots
    g.fig.patch.set_facecolor('#333333')  # Set the figure background to dark

    # Title each panel with the facet value the grid itself assigned to it.
    # g.col_names is the grid's own ordering, so titles stay matched to their
    # panels; the explicit set_title also fixes size, colour and weight, which
    # makes a separate g.set_titles(...) call redundant.
    for ax, title in zip(g.axes.flat, g.col_names):
        ax.set_title(title, fontsize=40, color='white', fontweight='bold')

        # Dark panel without a frame
        ax.tick_params(colors='white', which='both')
        ax.set_facecolor('#333333')
        for spine in ax.spines.values():
            spine.set_visible(False)

        # Add a subtle white grid
        ax.grid(color='white', linestyle='-', linewidth=0.2, alpha=0.3)

    # rect leaves a small margin at the bottom and top of the figure
    g.fig.tight_layout(rect=[0, 0.03, 1, 0.95])
    g.fig.savefig('static/images/2024-03-12-cpdb/TD50_distribution_route_dark_bg.png', format='png', facecolor=g.fig.get_facecolor(), edgecolor='none')

    # Show the plot
    plt.show()

route_analysis()

TD50s for different administration routes in CPDB

Get Most Toxic Examples

python

chemname = spark.read.parquet(cpdb.chemname_parquet)
intp = spark.read.parquet(cpdb.ncintp_parquet)

# lowest td50 recorded for each chemical across all rows of ncintp
min_td50 = intp.groupBy('chemcode').agg(F.min('td50').alias('td50'))

# Bound to its own name instead of `df`: the chart functions above read the
# global pandas `df`, and rebinding it to a Spark DataFrame here would break
# them if they were re-run after this cell.
most_toxic = min_td50.join(chemname,'chemcode').sort(F.col('td50').asc())
most_toxic.select('name','td50').show(10,truncate=False)
# +------------------------------------+-------+
# |name                                |td50   |
# +------------------------------------+-------+
# |2,3,7,8-TETRACHLORODIBENZO-p-DIOXIN |1.21E-5|
# |HCDD MIXTURE                        |5.96E-4|
# |o-CHLOROBENZALMALONONITRILE         |0.00649|
# |OZONE                               |0.0156 |
# |RIDDELLIINE                         |0.0267 |
# |THIO-TEPA                           |0.0332 |
# |OCHRATOXIN A                        |0.0579 |
# |POLYBROMINATED BIPHENYL MIXTURE     |0.0645 |
# |COBALT SULFATE HEPTAHYDRATE         |0.0826 |
# |LASIOCARPINE                        |0.102  |
# +------------------------------------+-------+