insilica

Working with CPDB in python - Notebook

Mar 18, 2024
Eva Gao, Tom Luechtefeld

Below is the full Python notebook for the post "Working with CPDB in python".

Setup

python
import biobricks as bb
import pyspark
import pyspark.sql.functions as F
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Upper bound on td50 kept for plotting; the chart captions give the unit as mg/kg/day.
TD50_MAX = 1000
# Administration routes kept for plotting.
ROUTES = ('gavage', 'diet', 'inhalation')

cpdb = bb.assets('cpdb')
spark = pyspark.sql.SparkSession.builder.getOrCreate()

## build a table of chemicals, routes, species, and td50
ncintp = spark.read.parquet(cpdb.ncintp_parquet)
species = spark.read.parquet(cpdb.species_parquet)
route = spark.read.parquet(cpdb.route_parquet)
sdf = ncintp.join(species, 'species').select('chemcode','species','spname','route','td50')
sdf = sdf.join(route, 'route').select('chemcode','species','spname','rtename','td50')

## take the minimum td50 for each (chemical, species, route) combination
sdf = sdf.groupBy('chemcode','spname','rtename').agg(F.min('td50').alias('td50'))

## filter by td50 < TD50_MAX (1000) and rtename in gavage, diet or inhalation
sdf = sdf.filter(sdf['td50'] < TD50_MAX)
sdf = sdf.filter(sdf['rtename'].isin(*ROUTES))

## count entries for each species and route
sdf.groupby('spname','rtename').count().sort(F.col('count').desc()).show()

df = sdf.toPandas() # pandas data frame with columns chemcode, spname, rtename, and td50

Now that we have a pandas data frame, we can build some plots.

TD50 CHART

python
def td50_chart():
    """Plot a dark-themed histogram of all TD50 values and save it as a PNG.

    Reads the ``td50`` column of the module-level pandas data frame ``df``
    and writes ``static/images/2024-03-12-cpdb/TD50_distribution.png`` at
    400 dpi. The output directory must already exist.
    """
    background = '#333333'

    fig, ax = plt.subplots(figsize=(10, 6))

    # Plotting the histogram with all bars in greenish color
    ax.hist(df['td50'], bins=30, color="#1b9e77", edgecolor='white')

    # Setting the title and labels
    ax.set_title('TD50 Values Distribution', fontsize=20, color='white', fontweight='bold')
    ax.set_xlabel('TD50 Values', fontsize=14, color='white')
    ax.set_ylabel('Counts', fontsize=14, color='white')

    # Setting the theme: dark figure and axes, white ticks on both axes
    ax.set_facecolor(background)
    fig.patch.set_facecolor(background)
    ax.tick_params(axis='both', colors='white')

    # Removing the frame
    for spine in ax.spines.values():
        spine.set_visible(False)

    fig.tight_layout()

    # Pass the figure's own face colour explicitly, as the other plotting
    # functions in this notebook do, so the saved PNG keeps the dark
    # background even where the savefig.facecolor default is white.
    fig.savefig(
        'static/images/2024-03-12-cpdb/TD50_distribution.png',
        format='png',
        dpi=400,
        facecolor=fig.get_facecolor(),
        edgecolor='none',
    )

td50_chart()
TD50s for all chemicals with TD50 less than 1000 mg/kg/day in CPDB

Build a plot for mice and rats

python
def species_analysis():
    """Plot one TD50 histogram per species and save the grid as a PNG.

    Reads the module-level pandas data frame ``df`` (columns ``spname`` and
    ``td50``), draws a dark-themed seaborn ``FacetGrid`` with one panel per
    ``spname`` value (two panels per row), and writes
    ``static/images/2024-03-12-cpdb/TD50_distribution_mouse_rat_dark_bg.png``.
    The output directory must already exist.
    """
    background = '#333333'

    g = sns.FacetGrid(df, col='spname', col_wrap=2, height=6, aspect=1.5, sharex=False, sharey=True)

    # density=True normalizes each panel's histogram so its area is 1
    g.map(plt.hist, 'td50', bins=30, color="#1b9e77", edgecolor='white', density=True)

    # Axis labels and a common x range for every panel
    g.set_xlabels('TD50', color='white', fontsize=20)
    g.set_ylabels('Frequency', color='white', fontsize=20)
    g.set(xlim=(0, 1000))

    # Adjust subplot parameters to ensure the background is consistently dark
    g.fig.subplots_adjust(wspace=0.05)  # Reduce space between subplots
    g.fig.patch.set_facecolor(background)  # Set the figure background to dark

    # g.col_names is the facet order the grid itself used, so each title is
    # guaranteed to match the data drawn in its panel.
    for ax, title in zip(g.axes.flat, g.col_names):
        ax.set_title(title, fontsize=40, color='white', fontweight='bold')

        # Style tick labels through tick_params so tick locations stay
        # automatic for the xlim set above instead of being pinned.
        ax.tick_params(colors='white', which='both', labelsize=16)
        ax.set_facecolor(background)

        # Remove the frame
        for spine in ax.spines.values():
            spine.set_visible(False)

        # Add a subtle white grid
        ax.grid(color='white', linestyle='-', linewidth=0.2, alpha=0.3)

    # Leave a small margin at the top and bottom of the figure
    g.fig.tight_layout(rect=[0, 0.03, 1, 0.95])
    g.fig.savefig(
        'static/images/2024-03-12-cpdb/TD50_distribution_mouse_rat_dark_bg.png',
        format='png',
        facecolor=g.fig.get_facecolor(),
        edgecolor='none',
    )

species_analysis()
TD50s for rats and mice in CPDB

Build a histogram for each route

python
def route_analysis():
    """Plot one TD50 histogram per administration route, save it, and show it.

    Reads the module-level pandas data frame ``df`` (columns ``rtename`` and
    ``td50``), draws a dark-themed seaborn ``FacetGrid`` with one panel per
    ``rtename`` value (three panels per row), writes
    ``static/images/2024-03-12-cpdb/TD50_distribution_route_dark_bg.png``,
    and then calls ``plt.show()``. The output directory must already exist.
    """
    background = '#333333'

    # Create a FacetGrid
    g = sns.FacetGrid(df, col='rtename', col_wrap=3, height=6, aspect=1.5, sharex=False, sharey=True)

    # density=True normalizes each panel's histogram so its area is 1
    g.map(plt.hist, 'td50', bins=30, color="#1b9e77", edgecolor='white', density=True)

    # Axis labels and a common x range for every panel
    g.set_xlabels('TD50', color='white', fontsize=20)
    g.set_ylabels('Frequency', color='white', fontsize=20)
    g.set(xlim=(0, 1000))

    # Adjust subplot parameters to ensure the background is consistently dark
    g.fig.subplots_adjust(wspace=0.05)  # Reduce space between subplots
    g.fig.patch.set_facecolor(background)  # Set the figure background to dark

    # g.col_names is the facet order the grid itself used, so each title is
    # guaranteed to match the data drawn in its panel.
    for ax, title in zip(g.axes.flat, g.col_names):
        ax.set_title(title, fontsize=40, color='white', fontweight='bold')

        # Style tick labels through tick_params so tick locations stay
        # automatic for the xlim set above instead of being pinned.
        ax.tick_params(colors='white', which='both', labelsize=16)
        ax.set_facecolor(background)

        # Remove the frame
        for spine in ax.spines.values():
            spine.set_visible(False)

        # Add a subtle white grid
        ax.grid(color='white', linestyle='-', linewidth=0.2, alpha=0.3)

    # Leave a small margin at the top and bottom of the figure
    g.fig.tight_layout(rect=[0, 0.03, 1, 0.95])
    g.fig.savefig(
        'static/images/2024-03-12-cpdb/TD50_distribution_route_dark_bg.png',
        format='png',
        facecolor=g.fig.get_facecolor(),
        edgecolor='none',
    )

    # Show the plot
    plt.show()

route_analysis()
TD50s for different administration routes in CPDB

Get Most Toxic Examples

python
# List the ten chemicals with the lowest minimum td50, with their names.
chemname = spark.read.parquet(cpdb.chemname_parquet)
intp = spark.read.parquet(cpdb.ncintp_parquet)

# One row per chemcode carrying the smallest td50 seen for it.
min_td50 = (
    intp
    .groupBy('chemcode')
    .agg(F.min('td50').alias('td50'))
)

# NOTE: this rebinds the module-level `df` (until now the pandas data frame
# read by the plotting functions) to a Spark DataFrame; re-run the setup
# cell before calling the plotting functions again.
df = (
    min_td50
    .join(chemname, 'chemcode')
    .orderBy(F.col('td50').asc())
)
df.select('name', 'td50').show(10, truncate=False)
# +------------------------------------+-------+
# |name                                |td50   |
# +------------------------------------+-------+
# |2,3,7,8-TETRACHLORODIBENZO-p-DIOXIN |1.21E-5|
# |HCDD MIXTURE                        |5.96E-4|
# |o-CHLOROBENZALMALONONITRILE         |0.00649|
# |OZONE                               |0.0156 |
# |RIDDELLIINE                         |0.0267 |
# |THIO-TEPA                           |0.0332 |
# |OCHRATOXIN A                        |0.0579 |
# |POLYBROMINATED BIPHENYL MIXTURE     |0.0645 |
# |COBALT SULFATE HEPTAHYDRATE         |0.0826 |
# |LASIOCARPINE                        |0.102  |
# +------------------------------------+-------+