{
"cells": [
{
"cell_type": "code",
"execution_count": 67,
"id": "geological-comfort",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/shellytrigg/bin/anaconda3/lib/python3.8/site-packages/pandas/core/frame.py:4441: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" return super().rename(\n"
]
}
],
"source": [
"# Import necessary libraries\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"import numpy as np\n",
"from sklearn.decomposition import PCA\n",
"from sklearn.preprocessing import StandardScaler\n",
"from pycombat import Combat\n",
"\n",
"# Load the gene counts data\n",
"gene_counts_path = '/mnt/c/Users/strigg/Downloads/salmon.merged.gene_counts (2).tsv'\n",
"metadata_path = '../data/SraRunTable (3).csv'\n",
"\n",
"# Replace with the correct paths to your files\n",
"gene_counts = pd.read_csv(gene_counts_path, sep='\\t')\n",
"metadata = pd.read_csv(metadata_path)\n",
"\n",
"# Simplify metadata for alignment\n",
"metadata_subset = metadata[['Experiment', 'treatment', 'Collection_Date', 'batch']]\n",
"metadata_subset.rename(columns={'Experiment': 'Sample'}, inplace=True)\n",
"\n",
"# Align gene counts with metadata\n",
"gene_count_samples = set(gene_counts.columns[2:])\n",
"metadata_samples = set(metadata_subset['Sample'])\n",
"aligned_gene_counts = gene_counts[['gene_id'] + list(gene_count_samples.intersection(metadata_samples))]\n",
"\n",
"# Filter genes with low expression (e.g., total counts < 10 across all samples)\n",
"filtered_gene_counts = aligned_gene_counts.loc[\n",
" aligned_gene_counts.iloc[:, 1:].sum(axis=1) > 10\n",
"]\n",
"\n",
"# Calculate CPM normalization\n",
"total_counts_per_sample = filtered_gene_counts.iloc[:, 1:].sum(axis=0)\n",
"cpm_counts = filtered_gene_counts.iloc[:, 1:].div(total_counts_per_sample, axis=1) * 1e6\n",
"cpm_counts.insert(0, 'gene_id', filtered_gene_counts['gene_id'])\n",
"\n",
"# Transpose CPM data for visualization\n",
"cpm_values = cpm_counts.drop(columns='gene_id').T\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "stunning-direction",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" 1 | \n",
" 4 | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
" 9 | \n",
" 10 | \n",
" 11 | \n",
" 12 | \n",
" ... | \n",
" 38253 | \n",
" 38254 | \n",
" 38256 | \n",
" 38257 | \n",
" 38258 | \n",
" 38259 | \n",
" 38260 | \n",
" 38261 | \n",
" 38262 | \n",
" 38263 | \n",
"
\n",
" \n",
" \n",
" \n",
" SRX9845534 | \n",
" 0.369614 | \n",
" 58.768655 | \n",
" 8.466382 | \n",
" 0.327109 | \n",
" 8.748028 | \n",
" 9.076985 | \n",
" 17.556674 | \n",
" 0.739228 | \n",
" 96.469671 | \n",
" 13.017257 | \n",
" ... | \n",
" 11.088425 | \n",
" 1.655687 | \n",
" 22.361658 | \n",
" 3.326528 | \n",
" 0.000000 | \n",
" 25.522044 | \n",
" 1.108843 | \n",
" 5.359406 | \n",
" 36.961418 | \n",
" 225.279843 | \n",
"
\n",
" \n",
" SRX9845541 | \n",
" 0.193769 | \n",
" 1.607303 | \n",
" 40.219177 | \n",
" 1.740887 | \n",
" 10.671240 | \n",
" 8.876954 | \n",
" 16.787383 | \n",
" 0.535768 | \n",
" 57.505718 | \n",
" 8.825163 | \n",
" ... | \n",
" 5.893443 | \n",
" 0.000000 | \n",
" 20.180578 | \n",
" 1.785892 | \n",
" 0.000000 | \n",
" 162.761183 | \n",
" 1.428713 | \n",
" 4.286140 | \n",
" 43.575761 | \n",
" 185.732753 | \n",
"
\n",
" \n",
" SRX9845526 | \n",
" 0.171783 | \n",
" 10.520205 | \n",
" 6.602058 | \n",
" 1.594947 | \n",
" 16.920632 | \n",
" 12.063236 | \n",
" 14.606324 | \n",
" 2.063937 | \n",
" 121.137229 | \n",
" 13.548794 | \n",
" ... | \n",
" 7.303162 | \n",
" 0.317529 | \n",
" 35.880752 | \n",
" 2.436875 | \n",
" 0.317529 | \n",
" 121.274560 | \n",
" 2.698995 | \n",
" 5.080460 | \n",
" 28.577590 | \n",
" 197.502900 | \n",
"
\n",
" \n",
" SRX9845530 | \n",
" 0.000000 | \n",
" 1.298883 | \n",
" 4.448240 | \n",
" 1.832146 | \n",
" 17.588459 | \n",
" 8.684763 | \n",
" 14.576350 | \n",
" 1.010242 | \n",
" 113.002650 | \n",
" 5.647253 | \n",
" ... | \n",
" 9.092179 | \n",
" 3.929697 | \n",
" 31.606146 | \n",
" 3.342458 | \n",
" 0.000000 | \n",
" 2.002300 | \n",
" 0.432961 | \n",
" 5.339851 | \n",
" 27.709498 | \n",
" 77.500001 | \n",
"
\n",
" \n",
" SRX9845543 | \n",
" 0.673737 | \n",
" 2.919526 | \n",
" 71.662472 | \n",
" 1.796632 | \n",
" 15.387027 | \n",
" 22.570634 | \n",
" 1.572053 | \n",
" 0.000000 | \n",
" 97.692296 | \n",
" 3.401922 | \n",
" ... | \n",
" 8.309421 | \n",
" 1.972702 | \n",
" 47.610739 | \n",
" 1.572053 | \n",
" 0.000000 | \n",
" 286.762850 | \n",
" 2.919526 | \n",
" 4.042421 | \n",
" 85.789162 | \n",
" 375.271438 | \n",
"
\n",
" \n",
"
\n",
"
5 rows × 32707 columns
\n",
"
"
],
"text/plain": [
" 0 1 4 5 6 7 \\\n",
"SRX9845534 0.369614 58.768655 8.466382 0.327109 8.748028 9.076985 \n",
"SRX9845541 0.193769 1.607303 40.219177 1.740887 10.671240 8.876954 \n",
"SRX9845526 0.171783 10.520205 6.602058 1.594947 16.920632 12.063236 \n",
"SRX9845530 0.000000 1.298883 4.448240 1.832146 17.588459 8.684763 \n",
"SRX9845543 0.673737 2.919526 71.662472 1.796632 15.387027 22.570634 \n",
"\n",
" 9 10 11 12 ... 38253 \\\n",
"SRX9845534 17.556674 0.739228 96.469671 13.017257 ... 11.088425 \n",
"SRX9845541 16.787383 0.535768 57.505718 8.825163 ... 5.893443 \n",
"SRX9845526 14.606324 2.063937 121.137229 13.548794 ... 7.303162 \n",
"SRX9845530 14.576350 1.010242 113.002650 5.647253 ... 9.092179 \n",
"SRX9845543 1.572053 0.000000 97.692296 3.401922 ... 8.309421 \n",
"\n",
" 38254 38256 38257 38258 38259 38260 \\\n",
"SRX9845534 1.655687 22.361658 3.326528 0.000000 25.522044 1.108843 \n",
"SRX9845541 0.000000 20.180578 1.785892 0.000000 162.761183 1.428713 \n",
"SRX9845526 0.317529 35.880752 2.436875 0.317529 121.274560 2.698995 \n",
"SRX9845530 3.929697 31.606146 3.342458 0.000000 2.002300 0.432961 \n",
"SRX9845543 1.972702 47.610739 1.572053 0.000000 286.762850 2.919526 \n",
"\n",
" 38261 38262 38263 \n",
"SRX9845534 5.359406 36.961418 225.279843 \n",
"SRX9845541 4.286140 43.575761 185.732753 \n",
"SRX9845526 5.080460 28.577590 197.502900 \n",
"SRX9845530 5.339851 27.709498 77.500001 \n",
"SRX9845543 4.042421 85.789162 375.271438 \n",
"\n",
"[5 rows x 32707 columns]"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cpm_values.head()"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "chemical-seminar",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"( SRX9845534 SRX9845541 SRX9845526 SRX9845530 SRX9845543 \\\n",
" count 32707.000000 32707.000000 32707.000000 32707.000000 32707.000000 \n",
" mean 165.440053 171.200147 192.577787 211.851658 136.141405 \n",
" std 1075.332847 957.385064 1048.747322 1345.628396 686.896627 \n",
" min 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
" 25% 2.000000 2.000000 4.000000 2.489000 2.000000 \n",
" 50% 23.895000 25.000000 33.037000 28.248000 20.406000 \n",
" 75% 103.660000 110.792000 131.232500 126.000000 96.000000 \n",
" max 91474.000000 85682.000000 87475.000000 137044.000000 69707.000000 \n",
" \n",
" SRX9845527 SRX9845506 SRX9845518 SRX9845501 \\\n",
" count 32707.000000 32707.000000 32707.000000 32707.000000 \n",
" mean 179.994930 183.099091 180.849091 159.277506 \n",
" std 1453.555755 1201.931238 974.852911 911.736745 \n",
" min 0.000000 0.000000 0.000000 0.000000 \n",
" 25% 2.000000 2.000000 5.581500 2.403500 \n",
" 50% 22.000000 25.318000 32.227000 25.640000 \n",
" 75% 98.000000 112.560500 125.000000 111.000000 \n",
" max 147753.000000 112650.000000 86589.000000 86268.000000 \n",
" \n",
" SRX9845536 ... SRX9845505 SRX9845531 SRX9845510 \\\n",
" count 32707.000000 ... 32707.000000 32707.000000 32707.000000 \n",
" mean 236.052346 ... 171.201700 206.463199 165.741786 \n",
" std 1271.652215 ... 986.912211 1131.480546 1038.359313 \n",
" min 0.000000 ... 0.000000 0.000000 0.000000 \n",
" 25% 5.000000 ... 3.000000 2.000000 3.000000 \n",
" 50% 43.000000 ... 27.522000 23.721000 28.000000 \n",
" 75% 172.000000 ... 113.682000 125.000000 119.000000 \n",
" max 100197.000000 ... 97256.000000 98612.000000 93615.000000 \n",
" \n",
" SRX9845535 SRX9845513 SRX9845512 SRX9845525 \\\n",
" count 32707.000000 32707.000000 32707.000000 32707.000000 \n",
" mean 199.503417 191.538372 131.001250 230.778746 \n",
" std 1311.955681 1232.982014 721.962074 1613.978983 \n",
" min 0.000000 0.000000 0.000000 0.000000 \n",
" 25% 3.000000 4.000000 2.000000 3.000000 \n",
" 50% 28.601000 30.000000 20.700000 30.670000 \n",
" 75% 118.922500 121.000000 93.943500 139.000000 \n",
" max 130487.000000 113471.000000 77585.000000 145177.000000 \n",
" \n",
" SRX9845521 SRX9845500 SRX9845511 \n",
" count 32707.000000 32707.000000 32707.000000 \n",
" mean 210.007663 148.738319 129.222067 \n",
" std 1196.141346 959.965734 688.428395 \n",
" min 0.000000 0.000000 0.000000 \n",
" 25% 3.000000 1.000000 2.000000 \n",
" 50% 31.306000 18.320000 20.000000 \n",
" 75% 134.000000 88.000000 90.000000 \n",
" max 116651.000000 95149.000000 67648.000000 \n",
" \n",
" [8 rows x 44 columns],\n",
" gene_id LOC111099029LOC111099030LOC111099033LOC1110990...\n",
" SRX9845534 5411047.807\n",
" SRX9845541 5599443.202\n",
" SRX9845526 6298641.692\n",
" SRX9845530 6929032.179\n",
" SRX9845543 4452776.927\n",
" SRX9845527 5887094.163\n",
" SRX9845506 5988621.973\n",
" SRX9845518 5915031.218\n",
" SRX9845501 5209489.38\n",
" SRX9845536 7720564.083\n",
" SRX9845529 6904563.668\n",
" SRX9845524 8228094.16\n",
" SRX9845507 5451326.526\n",
" SRX9845540 5058340.811\n",
" SRX9845517 5033662.657\n",
" SRX9845532 7787956.536\n",
" SRX9845503 5090347.989\n",
" SRX9845508 5380343.456\n",
" SRX9845537 7915664.803\n",
" SRX9845520 6881819.531\n",
" SRX9845523 4540587.389\n",
" SRX9845528 6565149.428\n",
" SRX9845516 6858787.739\n",
" SRX9845514 8076393.261\n",
" SRX9845522 9031287.071\n",
" SRX9845502 4523632.462\n",
" SRX9845509 4345607.019\n",
" SRX9845539 5521862.433\n",
" SRX9845542 4701381.722\n",
" SRX9845504 4731008.048\n",
" SRX9845538 7748085.775\n",
" SRX9845515 8087706.403\n",
" SRX9845519 5850139.193\n",
" SRX9845533 8105079.858\n",
" SRX9845505 5599493.994\n",
" SRX9845531 6752791.859\n",
" SRX9845510 5420916.597\n",
" SRX9845535 6525158.253\n",
" SRX9845513 6264645.549\n",
" SRX9845512 4284657.883\n",
" SRX9845525 7548080.453\n",
" SRX9845521 6868720.618\n",
" SRX9845500 4864784.191\n",
" SRX9845511 4226466.132\n",
" dtype: object)"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Summary statistics for the raw (unnormalized) gene count data\n",
"raw_summary = filtered_gene_counts.describe()\n",
"\n",
"# Calculate library sizes (total counts per sample)\n",
"library_sizes = filtered_gene_counts.sum(axis=0)\n",
"\n",
"# Display key statistics\n",
"raw_summary, library_sizes"
]
},
{
"cell_type": "code",
"execution_count": 41,
"id": "divided-commander",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# Boxplot of CPM-normalized counts\n",
"plt.figure(figsize=(15, 5))\n",
"sns.boxplot(data=np.log10(raw_summary.T[['mean']] + 1), palette=\"coolwarm\")\n",
"plt.title(\"Distribution of Log10-Transformed mean CPM Across Samples\")\n",
"plt.xlabel(\"Samples\")\n",
"plt.ylabel(\"Log10(CPM + 1)\")\n",
"plt.xticks(rotation=90)\n",
"plt.tight_layout()\n",
"plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "accredited-lesbian",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# Boxplot of CPM-normalized max counts\n",
"plt.figure(figsize=(15, 5))\n",
"sns.boxplot(data=np.log10(raw_summary.T[['max']] + 1), palette=\"coolwarm\")\n",
"plt.title(\"Distribution of Log10-Transformed mean CPM Across Samples\")\n",
"plt.xlabel(\"Samples\")\n",
"plt.ylabel(\"Log10(CPM + 1)\")\n",
"plt.xticks(rotation=90)\n",
"plt.tight_layout()\n",
"plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "necessary-wonder",
"metadata": {},
"outputs": [],
"source": [
"# Boxplot of CPM-normalized max counts\n",
"plt.figure(figsize=(15, 5))\n",
"sns.boxplot(data=np.log10(raw_summary.T[['max']] + 1), palette=\"coolwarm\")\n",
"plt.title(\"Distribution of Log10-Transformed mean CPM Across Samples\")\n",
"plt.xlabel(\"Samples\")\n",
"plt.ylabel(\"Log10(CPM + 1)\")\n",
"plt.xticks(rotation=90)\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "consecutive-death",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" count | \n",
" mean | \n",
" std | \n",
" min | \n",
" 25% | \n",
" 50% | \n",
" 75% | \n",
" max | \n",
"
\n",
" \n",
" \n",
" \n",
" SRX9845534 | \n",
" 32707.0 | \n",
" 165.440053 | \n",
" 1075.332847 | \n",
" 0.0 | \n",
" 2.0000 | \n",
" 23.895 | \n",
" 103.6600 | \n",
" 91474.000 | \n",
"
\n",
" \n",
" SRX9845541 | \n",
" 32707.0 | \n",
" 171.200147 | \n",
" 957.385064 | \n",
" 0.0 | \n",
" 2.0000 | \n",
" 25.000 | \n",
" 110.7920 | \n",
" 85682.000 | \n",
"
\n",
" \n",
" SRX9845526 | \n",
" 32707.0 | \n",
" 192.577787 | \n",
" 1048.747322 | \n",
" 0.0 | \n",
" 4.0000 | \n",
" 33.037 | \n",
" 131.2325 | \n",
" 87475.000 | \n",
"
\n",
" \n",
" SRX9845530 | \n",
" 32707.0 | \n",
" 211.851658 | \n",
" 1345.628396 | \n",
" 0.0 | \n",
" 2.4890 | \n",
" 28.248 | \n",
" 126.0000 | \n",
" 137044.000 | \n",
"
\n",
" \n",
" SRX9845543 | \n",
" 32707.0 | \n",
" 136.141405 | \n",
" 686.896627 | \n",
" 0.0 | \n",
" 2.0000 | \n",
" 20.406 | \n",
" 96.0000 | \n",
" 69707.000 | \n",
"
\n",
" \n",
" SRX9845527 | \n",
" 32707.0 | \n",
" 179.994930 | \n",
" 1453.555755 | \n",
" 0.0 | \n",
" 2.0000 | \n",
" 22.000 | \n",
" 98.0000 | \n",
" 147753.000 | \n",
"
\n",
" \n",
" SRX9845506 | \n",
" 32707.0 | \n",
" 183.099091 | \n",
" 1201.931238 | \n",
" 0.0 | \n",
" 2.0000 | \n",
" 25.318 | \n",
" 112.5605 | \n",
" 112650.000 | \n",
"
\n",
" \n",
" SRX9845518 | \n",
" 32707.0 | \n",
" 180.849091 | \n",
" 974.852911 | \n",
" 0.0 | \n",
" 5.5815 | \n",
" 32.227 | \n",
" 125.0000 | \n",
" 86589.000 | \n",
"
\n",
" \n",
" SRX9845501 | \n",
" 32707.0 | \n",
" 159.277506 | \n",
" 911.736745 | \n",
" 0.0 | \n",
" 2.4035 | \n",
" 25.640 | \n",
" 111.0000 | \n",
" 86268.000 | \n",
"
\n",
" \n",
" SRX9845536 | \n",
" 32707.0 | \n",
" 236.052346 | \n",
" 1271.652215 | \n",
" 0.0 | \n",
" 5.0000 | \n",
" 43.000 | \n",
" 172.0000 | \n",
" 100197.000 | \n",
"
\n",
" \n",
" SRX9845529 | \n",
" 32707.0 | \n",
" 211.103546 | \n",
" 1354.049251 | \n",
" 0.0 | \n",
" 2.0000 | \n",
" 27.000 | \n",
" 129.0000 | \n",
" 120295.000 | \n",
"
\n",
" \n",
" SRX9845524 | \n",
" 32707.0 | \n",
" 251.569822 | \n",
" 1655.717746 | \n",
" 0.0 | \n",
" 5.0000 | \n",
" 41.000 | \n",
" 164.9465 | \n",
" 164959.000 | \n",
"
\n",
" \n",
" SRX9845507 | \n",
" 32707.0 | \n",
" 166.671554 | \n",
" 1003.054131 | \n",
" 0.0 | \n",
" 1.0165 | \n",
" 19.489 | \n",
" 96.7280 | \n",
" 102536.000 | \n",
"
\n",
" \n",
" SRX9845540 | \n",
" 32707.0 | \n",
" 154.656215 | \n",
" 845.141190 | \n",
" 0.0 | \n",
" 1.0000 | \n",
" 18.126 | \n",
" 94.1795 | \n",
" 84558.000 | \n",
"
\n",
" \n",
" SRX9845517 | \n",
" 32707.0 | \n",
" 153.901693 | \n",
" 878.596550 | \n",
" 0.0 | \n",
" 2.5375 | \n",
" 24.088 | \n",
" 103.9995 | \n",
" 82089.000 | \n",
"
\n",
" \n",
" SRX9845532 | \n",
" 32707.0 | \n",
" 238.112836 | \n",
" 1340.437656 | \n",
" 0.0 | \n",
" 3.0000 | \n",
" 29.000 | \n",
" 145.0000 | \n",
" 124108.000 | \n",
"
\n",
" \n",
" SRX9845503 | \n",
" 32707.0 | \n",
" 155.634818 | \n",
" 847.085587 | \n",
" 0.0 | \n",
" 2.0000 | \n",
" 26.000 | \n",
" 110.3835 | \n",
" 83942.000 | \n",
"
\n",
" \n",
" SRX9845508 | \n",
" 32707.0 | \n",
" 164.501283 | \n",
" 940.254834 | \n",
" 0.0 | \n",
" 3.0000 | \n",
" 25.654 | \n",
" 108.0000 | \n",
" 101989.000 | \n",
"
\n",
" \n",
" SRX9845537 | \n",
" 32707.0 | \n",
" 242.017452 | \n",
" 1390.839993 | \n",
" 0.0 | \n",
" 5.0000 | \n",
" 42.000 | \n",
" 170.6250 | \n",
" 112791.000 | \n",
"
\n",
" \n",
" SRX9845520 | \n",
" 32707.0 | \n",
" 210.408155 | \n",
" 1202.519390 | \n",
" 0.0 | \n",
" 1.0000 | \n",
" 16.035 | \n",
" 104.0000 | \n",
" 108452.000 | \n",
"
\n",
" \n",
" SRX9845523 | \n",
" 32707.0 | \n",
" 138.826165 | \n",
" 729.033500 | \n",
" 0.0 | \n",
" 2.0000 | \n",
" 23.000 | \n",
" 101.0000 | \n",
" 71338.000 | \n",
"
\n",
" \n",
" SRX9845528 | \n",
" 32707.0 | \n",
" 200.726127 | \n",
" 1217.133325 | \n",
" 0.0 | \n",
" 3.0000 | \n",
" 28.208 | \n",
" 126.0380 | \n",
" 116776.000 | \n",
"
\n",
" \n",
" SRX9845516 | \n",
" 32707.0 | \n",
" 209.703970 | \n",
" 1261.841796 | \n",
" 0.0 | \n",
" 3.0000 | \n",
" 29.000 | \n",
" 129.0225 | \n",
" 120823.000 | \n",
"
\n",
" \n",
" SRX9845514 | \n",
" 32707.0 | \n",
" 246.931643 | \n",
" 1886.795410 | \n",
" 0.0 | \n",
" 4.0000 | \n",
" 36.667 | \n",
" 148.7890 | \n",
" 192241.000 | \n",
"
\n",
" \n",
" SRX9845522 | \n",
" 32707.0 | \n",
" 276.127039 | \n",
" 1382.403612 | \n",
" 0.0 | \n",
" 6.0000 | \n",
" 50.302 | \n",
" 201.2240 | \n",
" 112666.000 | \n",
"
\n",
" \n",
" SRX9845502 | \n",
" 32707.0 | \n",
" 138.307777 | \n",
" 821.717405 | \n",
" 0.0 | \n",
" 1.2725 | \n",
" 18.181 | \n",
" 85.0890 | \n",
" 80409.000 | \n",
"
\n",
" \n",
" SRX9845509 | \n",
" 32707.0 | \n",
" 132.864739 | \n",
" 728.780236 | \n",
" 0.0 | \n",
" 2.0000 | \n",
" 22.000 | \n",
" 92.0000 | \n",
" 74524.000 | \n",
"
\n",
" \n",
" SRX9845539 | \n",
" 32707.0 | \n",
" 168.828154 | \n",
" 1117.806114 | \n",
" 0.0 | \n",
" 2.0000 | \n",
" 23.000 | \n",
" 103.1225 | \n",
" 103742.000 | \n",
"
\n",
" \n",
" SRX9845542 | \n",
" 32707.0 | \n",
" 143.742371 | \n",
" 902.337729 | \n",
" 0.0 | \n",
" 1.3070 | \n",
" 20.015 | \n",
" 92.8320 | \n",
" 97866.000 | \n",
"
\n",
" \n",
" SRX9845504 | \n",
" 32707.0 | \n",
" 144.648181 | \n",
" 773.470471 | \n",
" 0.0 | \n",
" 2.0000 | \n",
" 23.275 | \n",
" 99.0000 | \n",
" 73309.000 | \n",
"
\n",
" \n",
" SRX9845538 | \n",
" 32707.0 | \n",
" 236.893808 | \n",
" 1763.467141 | \n",
" 0.0 | \n",
" 4.0000 | \n",
" 34.332 | \n",
" 143.0000 | \n",
" 168449.000 | \n",
"
\n",
" \n",
" SRX9845515 | \n",
" 32707.0 | \n",
" 247.277537 | \n",
" 1320.928892 | \n",
" 0.0 | \n",
" 4.0000 | \n",
" 39.000 | \n",
" 168.0000 | \n",
" 121237.000 | \n",
"
\n",
" \n",
" SRX9845519 | \n",
" 32707.0 | \n",
" 178.865050 | \n",
" 1036.722815 | \n",
" 0.0 | \n",
" 3.9995 | \n",
" 29.826 | \n",
" 125.5245 | \n",
" 77518.988 | \n",
"
\n",
" \n",
" SRX9845533 | \n",
" 32707.0 | \n",
" 247.808722 | \n",
" 1637.109146 | \n",
" 0.0 | \n",
" 2.0000 | \n",
" 22.000 | \n",
" 124.0000 | \n",
" 152243.000 | \n",
"
\n",
" \n",
" SRX9845505 | \n",
" 32707.0 | \n",
" 171.201700 | \n",
" 986.912211 | \n",
" 0.0 | \n",
" 3.0000 | \n",
" 27.522 | \n",
" 113.6820 | \n",
" 97256.000 | \n",
"
\n",
" \n",
" SRX9845531 | \n",
" 32707.0 | \n",
" 206.463199 | \n",
" 1131.480546 | \n",
" 0.0 | \n",
" 2.0000 | \n",
" 23.721 | \n",
" 125.0000 | \n",
" 98612.000 | \n",
"
\n",
" \n",
" SRX9845510 | \n",
" 32707.0 | \n",
" 165.741786 | \n",
" 1038.359313 | \n",
" 0.0 | \n",
" 3.0000 | \n",
" 28.000 | \n",
" 119.0000 | \n",
" 93615.000 | \n",
"
\n",
" \n",
" SRX9845535 | \n",
" 32707.0 | \n",
" 199.503417 | \n",
" 1311.955681 | \n",
" 0.0 | \n",
" 3.0000 | \n",
" 28.601 | \n",
" 118.9225 | \n",
" 130487.000 | \n",
"
\n",
" \n",
" SRX9845513 | \n",
" 32707.0 | \n",
" 191.538372 | \n",
" 1232.982014 | \n",
" 0.0 | \n",
" 4.0000 | \n",
" 30.000 | \n",
" 121.0000 | \n",
" 113471.000 | \n",
"
\n",
" \n",
" SRX9845512 | \n",
" 32707.0 | \n",
" 131.001250 | \n",
" 721.962074 | \n",
" 0.0 | \n",
" 2.0000 | \n",
" 20.700 | \n",
" 93.9435 | \n",
" 77585.000 | \n",
"
\n",
" \n",
" SRX9845525 | \n",
" 32707.0 | \n",
" 230.778746 | \n",
" 1613.978983 | \n",
" 0.0 | \n",
" 3.0000 | \n",
" 30.670 | \n",
" 139.0000 | \n",
" 145177.000 | \n",
"
\n",
" \n",
" SRX9845521 | \n",
" 32707.0 | \n",
" 210.007663 | \n",
" 1196.141346 | \n",
" 0.0 | \n",
" 3.0000 | \n",
" 31.306 | \n",
" 134.0000 | \n",
" 116651.000 | \n",
"
\n",
" \n",
" SRX9845500 | \n",
" 32707.0 | \n",
" 148.738319 | \n",
" 959.965734 | \n",
" 0.0 | \n",
" 1.0000 | \n",
" 18.320 | \n",
" 88.0000 | \n",
" 95149.000 | \n",
"
\n",
" \n",
" SRX9845511 | \n",
" 32707.0 | \n",
" 129.222067 | \n",
" 688.428395 | \n",
" 0.0 | \n",
" 2.0000 | \n",
" 20.000 | \n",
" 90.0000 | \n",
" 67648.000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" count mean std min 25% 50% 75% \\\n",
"SRX9845534 32707.0 165.440053 1075.332847 0.0 2.0000 23.895 103.6600 \n",
"SRX9845541 32707.0 171.200147 957.385064 0.0 2.0000 25.000 110.7920 \n",
"SRX9845526 32707.0 192.577787 1048.747322 0.0 4.0000 33.037 131.2325 \n",
"SRX9845530 32707.0 211.851658 1345.628396 0.0 2.4890 28.248 126.0000 \n",
"SRX9845543 32707.0 136.141405 686.896627 0.0 2.0000 20.406 96.0000 \n",
"SRX9845527 32707.0 179.994930 1453.555755 0.0 2.0000 22.000 98.0000 \n",
"SRX9845506 32707.0 183.099091 1201.931238 0.0 2.0000 25.318 112.5605 \n",
"SRX9845518 32707.0 180.849091 974.852911 0.0 5.5815 32.227 125.0000 \n",
"SRX9845501 32707.0 159.277506 911.736745 0.0 2.4035 25.640 111.0000 \n",
"SRX9845536 32707.0 236.052346 1271.652215 0.0 5.0000 43.000 172.0000 \n",
"SRX9845529 32707.0 211.103546 1354.049251 0.0 2.0000 27.000 129.0000 \n",
"SRX9845524 32707.0 251.569822 1655.717746 0.0 5.0000 41.000 164.9465 \n",
"SRX9845507 32707.0 166.671554 1003.054131 0.0 1.0165 19.489 96.7280 \n",
"SRX9845540 32707.0 154.656215 845.141190 0.0 1.0000 18.126 94.1795 \n",
"SRX9845517 32707.0 153.901693 878.596550 0.0 2.5375 24.088 103.9995 \n",
"SRX9845532 32707.0 238.112836 1340.437656 0.0 3.0000 29.000 145.0000 \n",
"SRX9845503 32707.0 155.634818 847.085587 0.0 2.0000 26.000 110.3835 \n",
"SRX9845508 32707.0 164.501283 940.254834 0.0 3.0000 25.654 108.0000 \n",
"SRX9845537 32707.0 242.017452 1390.839993 0.0 5.0000 42.000 170.6250 \n",
"SRX9845520 32707.0 210.408155 1202.519390 0.0 1.0000 16.035 104.0000 \n",
"SRX9845523 32707.0 138.826165 729.033500 0.0 2.0000 23.000 101.0000 \n",
"SRX9845528 32707.0 200.726127 1217.133325 0.0 3.0000 28.208 126.0380 \n",
"SRX9845516 32707.0 209.703970 1261.841796 0.0 3.0000 29.000 129.0225 \n",
"SRX9845514 32707.0 246.931643 1886.795410 0.0 4.0000 36.667 148.7890 \n",
"SRX9845522 32707.0 276.127039 1382.403612 0.0 6.0000 50.302 201.2240 \n",
"SRX9845502 32707.0 138.307777 821.717405 0.0 1.2725 18.181 85.0890 \n",
"SRX9845509 32707.0 132.864739 728.780236 0.0 2.0000 22.000 92.0000 \n",
"SRX9845539 32707.0 168.828154 1117.806114 0.0 2.0000 23.000 103.1225 \n",
"SRX9845542 32707.0 143.742371 902.337729 0.0 1.3070 20.015 92.8320 \n",
"SRX9845504 32707.0 144.648181 773.470471 0.0 2.0000 23.275 99.0000 \n",
"SRX9845538 32707.0 236.893808 1763.467141 0.0 4.0000 34.332 143.0000 \n",
"SRX9845515 32707.0 247.277537 1320.928892 0.0 4.0000 39.000 168.0000 \n",
"SRX9845519 32707.0 178.865050 1036.722815 0.0 3.9995 29.826 125.5245 \n",
"SRX9845533 32707.0 247.808722 1637.109146 0.0 2.0000 22.000 124.0000 \n",
"SRX9845505 32707.0 171.201700 986.912211 0.0 3.0000 27.522 113.6820 \n",
"SRX9845531 32707.0 206.463199 1131.480546 0.0 2.0000 23.721 125.0000 \n",
"SRX9845510 32707.0 165.741786 1038.359313 0.0 3.0000 28.000 119.0000 \n",
"SRX9845535 32707.0 199.503417 1311.955681 0.0 3.0000 28.601 118.9225 \n",
"SRX9845513 32707.0 191.538372 1232.982014 0.0 4.0000 30.000 121.0000 \n",
"SRX9845512 32707.0 131.001250 721.962074 0.0 2.0000 20.700 93.9435 \n",
"SRX9845525 32707.0 230.778746 1613.978983 0.0 3.0000 30.670 139.0000 \n",
"SRX9845521 32707.0 210.007663 1196.141346 0.0 3.0000 31.306 134.0000 \n",
"SRX9845500 32707.0 148.738319 959.965734 0.0 1.0000 18.320 88.0000 \n",
"SRX9845511 32707.0 129.222067 688.428395 0.0 2.0000 20.000 90.0000 \n",
"\n",
" max \n",
"SRX9845534 91474.000 \n",
"SRX9845541 85682.000 \n",
"SRX9845526 87475.000 \n",
"SRX9845530 137044.000 \n",
"SRX9845543 69707.000 \n",
"SRX9845527 147753.000 \n",
"SRX9845506 112650.000 \n",
"SRX9845518 86589.000 \n",
"SRX9845501 86268.000 \n",
"SRX9845536 100197.000 \n",
"SRX9845529 120295.000 \n",
"SRX9845524 164959.000 \n",
"SRX9845507 102536.000 \n",
"SRX9845540 84558.000 \n",
"SRX9845517 82089.000 \n",
"SRX9845532 124108.000 \n",
"SRX9845503 83942.000 \n",
"SRX9845508 101989.000 \n",
"SRX9845537 112791.000 \n",
"SRX9845520 108452.000 \n",
"SRX9845523 71338.000 \n",
"SRX9845528 116776.000 \n",
"SRX9845516 120823.000 \n",
"SRX9845514 192241.000 \n",
"SRX9845522 112666.000 \n",
"SRX9845502 80409.000 \n",
"SRX9845509 74524.000 \n",
"SRX9845539 103742.000 \n",
"SRX9845542 97866.000 \n",
"SRX9845504 73309.000 \n",
"SRX9845538 168449.000 \n",
"SRX9845515 121237.000 \n",
"SRX9845519 77518.988 \n",
"SRX9845533 152243.000 \n",
"SRX9845505 97256.000 \n",
"SRX9845531 98612.000 \n",
"SRX9845510 93615.000 \n",
"SRX9845535 130487.000 \n",
"SRX9845513 113471.000 \n",
"SRX9845512 77585.000 \n",
"SRX9845525 145177.000 \n",
"SRX9845521 116651.000 \n",
"SRX9845500 95149.000 \n",
"SRX9845511 67648.000 "
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"raw_summary.T"
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "blind-restaurant",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" mean | \n",
"
\n",
" \n",
" \n",
" \n",
" SRX9845534 | \n",
" 165.440053 | \n",
"
\n",
" \n",
" SRX9845541 | \n",
" 171.200147 | \n",
"
\n",
" \n",
" SRX9845526 | \n",
" 192.577787 | \n",
"
\n",
" \n",
" SRX9845530 | \n",
" 211.851658 | \n",
"
\n",
" \n",
" SRX9845543 | \n",
" 136.141405 | \n",
"
\n",
" \n",
" SRX9845527 | \n",
" 179.994930 | \n",
"
\n",
" \n",
" SRX9845506 | \n",
" 183.099091 | \n",
"
\n",
" \n",
" SRX9845518 | \n",
" 180.849091 | \n",
"
\n",
" \n",
" SRX9845501 | \n",
" 159.277506 | \n",
"
\n",
" \n",
" SRX9845536 | \n",
" 236.052346 | \n",
"
\n",
" \n",
" SRX9845529 | \n",
" 211.103546 | \n",
"
\n",
" \n",
" SRX9845524 | \n",
" 251.569822 | \n",
"
\n",
" \n",
" SRX9845507 | \n",
" 166.671554 | \n",
"
\n",
" \n",
" SRX9845540 | \n",
" 154.656215 | \n",
"
\n",
" \n",
" SRX9845517 | \n",
" 153.901693 | \n",
"
\n",
" \n",
" SRX9845532 | \n",
" 238.112836 | \n",
"
\n",
" \n",
" SRX9845503 | \n",
" 155.634818 | \n",
"
\n",
" \n",
" SRX9845508 | \n",
" 164.501283 | \n",
"
\n",
" \n",
" SRX9845537 | \n",
" 242.017452 | \n",
"
\n",
" \n",
" SRX9845520 | \n",
" 210.408155 | \n",
"
\n",
" \n",
" SRX9845523 | \n",
" 138.826165 | \n",
"
\n",
" \n",
" SRX9845528 | \n",
" 200.726127 | \n",
"
\n",
" \n",
" SRX9845516 | \n",
" 209.703970 | \n",
"
\n",
" \n",
" SRX9845514 | \n",
" 246.931643 | \n",
"
\n",
" \n",
" SRX9845522 | \n",
" 276.127039 | \n",
"
\n",
" \n",
" SRX9845502 | \n",
" 138.307777 | \n",
"
\n",
" \n",
" SRX9845509 | \n",
" 132.864739 | \n",
"
\n",
" \n",
" SRX9845539 | \n",
" 168.828154 | \n",
"
\n",
" \n",
" SRX9845542 | \n",
" 143.742371 | \n",
"
\n",
" \n",
" SRX9845504 | \n",
" 144.648181 | \n",
"
\n",
" \n",
" SRX9845538 | \n",
" 236.893808 | \n",
"
\n",
" \n",
" SRX9845515 | \n",
" 247.277537 | \n",
"
\n",
" \n",
" SRX9845519 | \n",
" 178.865050 | \n",
"
\n",
" \n",
" SRX9845533 | \n",
" 247.808722 | \n",
"
\n",
" \n",
" SRX9845505 | \n",
" 171.201700 | \n",
"
\n",
" \n",
" SRX9845531 | \n",
" 206.463199 | \n",
"
\n",
" \n",
" SRX9845510 | \n",
" 165.741786 | \n",
"
\n",
" \n",
" SRX9845535 | \n",
" 199.503417 | \n",
"
\n",
" \n",
" SRX9845513 | \n",
" 191.538372 | \n",
"
\n",
" \n",
" SRX9845512 | \n",
" 131.001250 | \n",
"
\n",
" \n",
" SRX9845525 | \n",
" 230.778746 | \n",
"
\n",
" \n",
" SRX9845521 | \n",
" 210.007663 | \n",
"
\n",
" \n",
" SRX9845500 | \n",
" 148.738319 | \n",
"
\n",
" \n",
" SRX9845511 | \n",
" 129.222067 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" mean\n",
"SRX9845534 165.440053\n",
"SRX9845541 171.200147\n",
"SRX9845526 192.577787\n",
"SRX9845530 211.851658\n",
"SRX9845543 136.141405\n",
"SRX9845527 179.994930\n",
"SRX9845506 183.099091\n",
"SRX9845518 180.849091\n",
"SRX9845501 159.277506\n",
"SRX9845536 236.052346\n",
"SRX9845529 211.103546\n",
"SRX9845524 251.569822\n",
"SRX9845507 166.671554\n",
"SRX9845540 154.656215\n",
"SRX9845517 153.901693\n",
"SRX9845532 238.112836\n",
"SRX9845503 155.634818\n",
"SRX9845508 164.501283\n",
"SRX9845537 242.017452\n",
"SRX9845520 210.408155\n",
"SRX9845523 138.826165\n",
"SRX9845528 200.726127\n",
"SRX9845516 209.703970\n",
"SRX9845514 246.931643\n",
"SRX9845522 276.127039\n",
"SRX9845502 138.307777\n",
"SRX9845509 132.864739\n",
"SRX9845539 168.828154\n",
"SRX9845542 143.742371\n",
"SRX9845504 144.648181\n",
"SRX9845538 236.893808\n",
"SRX9845515 247.277537\n",
"SRX9845519 178.865050\n",
"SRX9845533 247.808722\n",
"SRX9845505 171.201700\n",
"SRX9845531 206.463199\n",
"SRX9845510 165.741786\n",
"SRX9845535 199.503417\n",
"SRX9845513 191.538372\n",
"SRX9845512 131.001250\n",
"SRX9845525 230.778746\n",
"SRX9845521 210.007663\n",
"SRX9845500 148.738319\n",
"SRX9845511 129.222067"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"raw_summary.T[['mean']]"
]
},
{
"cell_type": "code",
"execution_count": 62,
"id": "ranging-teddy",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" SRX9845534 | \n",
" SRX9845541 | \n",
" SRX9845526 | \n",
" SRX9845530 | \n",
" SRX9845543 | \n",
" SRX9845527 | \n",
" SRX9845506 | \n",
" SRX9845518 | \n",
" SRX9845501 | \n",
" SRX9845536 | \n",
" ... | \n",
" SRX9845505 | \n",
" SRX9845531 | \n",
" SRX9845510 | \n",
" SRX9845535 | \n",
" SRX9845513 | \n",
" SRX9845512 | \n",
" SRX9845525 | \n",
" SRX9845521 | \n",
" SRX9845500 | \n",
" SRX9845511 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0.136598 | \n",
" 0.076920 | \n",
" 0.068847 | \n",
" 0.000000 | \n",
" 0.223687 | \n",
" 0.178859 | \n",
" 0.176366 | \n",
" 0.266068 | \n",
" 0.076261 | \n",
" 0.280275 | \n",
" ... | \n",
" 0.444962 | \n",
" 0.240658 | \n",
" 0.073524 | \n",
" 0.429071 | \n",
" 0.414348 | \n",
" 0.166365 | \n",
" 0.529522 | \n",
" 0.157385 | \n",
" 0.081188 | \n",
" 0.338823 | \n",
"
\n",
" \n",
" 1 | \n",
" 1.776473 | \n",
" 0.416191 | \n",
" 1.061460 | \n",
" 0.361517 | \n",
" 0.593234 | \n",
" 0.807178 | \n",
" 0.921544 | \n",
" 0.224337 | \n",
" 0.292209 | \n",
" 0.000000 | \n",
" ... | \n",
" 0.186324 | \n",
" 1.336982 | \n",
" 0.481325 | \n",
" 0.748001 | \n",
" 1.076306 | \n",
" 0.457447 | \n",
" 0.811491 | \n",
" 0.157385 | \n",
" 1.246752 | \n",
" 0.634725 | \n",
"
\n",
" \n",
" 4 | \n",
" 0.976184 | \n",
" 1.615099 | \n",
" 0.880931 | \n",
" 0.736256 | \n",
" 1.861310 | \n",
" 1.461123 | \n",
" 1.670401 | \n",
" 1.630880 | \n",
" 0.500412 | \n",
" 1.236466 | \n",
" ... | \n",
" 1.511933 | \n",
" 1.548389 | \n",
" 0.830142 | \n",
" 1.288107 | \n",
" 1.533238 | \n",
" 1.193131 | \n",
" 1.257932 | \n",
" 1.245244 | \n",
" 0.904844 | \n",
" 0.868448 | \n",
"
\n",
" \n",
" 5 | \n",
" 0.122906 | \n",
" 0.437891 | \n",
" 0.414129 | \n",
" 0.452116 | \n",
" 0.446635 | \n",
" 0.273111 | \n",
" 0.410352 | \n",
" 0.481261 | \n",
" 0.532831 | \n",
" 0.668347 | \n",
" ... | \n",
" 0.301321 | \n",
" 0.332836 | \n",
" 0.581791 | \n",
" 0.495957 | \n",
" 0.461756 | \n",
" 0.486370 | \n",
" 0.493621 | \n",
" 0.445808 | \n",
" 0.491850 | \n",
" 0.397706 | \n",
"
\n",
" \n",
" 6 | \n",
" 0.988917 | \n",
" 1.067117 | \n",
" 1.253353 | \n",
" 1.269243 | \n",
" 1.214500 | \n",
" 1.173312 | \n",
" 0.945427 | \n",
" 1.195596 | \n",
" 1.060592 | \n",
" 1.361088 | \n",
" ... | \n",
" 1.028970 | \n",
" 1.300061 | \n",
" 1.039837 | \n",
" 1.292161 | \n",
" 1.426175 | \n",
" 1.084310 | \n",
" 1.130240 | \n",
" 1.032947 | \n",
" 1.160286 | \n",
" 0.942124 | \n",
"
\n",
" \n",
"
\n",
"
5 rows × 44 columns
\n",
"
"
],
"text/plain": [
" SRX9845534 SRX9845541 SRX9845526 SRX9845530 SRX9845543 SRX9845527 \\\n",
"0 0.136598 0.076920 0.068847 0.000000 0.223687 0.178859 \n",
"1 1.776473 0.416191 1.061460 0.361517 0.593234 0.807178 \n",
"4 0.976184 1.615099 0.880931 0.736256 1.861310 1.461123 \n",
"5 0.122906 0.437891 0.414129 0.452116 0.446635 0.273111 \n",
"6 0.988917 1.067117 1.253353 1.269243 1.214500 1.173312 \n",
"\n",
" SRX9845506 SRX9845518 SRX9845501 SRX9845536 ... SRX9845505 \\\n",
"0 0.176366 0.266068 0.076261 0.280275 ... 0.444962 \n",
"1 0.921544 0.224337 0.292209 0.000000 ... 0.186324 \n",
"4 1.670401 1.630880 0.500412 1.236466 ... 1.511933 \n",
"5 0.410352 0.481261 0.532831 0.668347 ... 0.301321 \n",
"6 0.945427 1.195596 1.060592 1.361088 ... 1.028970 \n",
"\n",
" SRX9845531 SRX9845510 SRX9845535 SRX9845513 SRX9845512 SRX9845525 \\\n",
"0 0.240658 0.073524 0.429071 0.414348 0.166365 0.529522 \n",
"1 1.336982 0.481325 0.748001 1.076306 0.457447 0.811491 \n",
"4 1.548389 0.830142 1.288107 1.533238 1.193131 1.257932 \n",
"5 0.332836 0.581791 0.495957 0.461756 0.486370 0.493621 \n",
"6 1.300061 1.039837 1.292161 1.426175 1.084310 1.130240 \n",
"\n",
" SRX9845521 SRX9845500 SRX9845511 \n",
"0 0.157385 0.081188 0.338823 \n",
"1 0.157385 1.246752 0.634725 \n",
"4 1.245244 0.904844 0.868448 \n",
"5 0.445808 0.491850 0.397706 \n",
"6 1.032947 1.160286 0.942124 \n",
"\n",
"[5 rows x 44 columns]"
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Log10-transform the CPM values for Combat; the pseudocount of 1 maps a CPM of 0 to 0\n",
"# instead of -inf. The first column of cpm_counts is skipped by position.\n",
"cpm_values = cpm_counts.iloc[:, 1:]\n",
"log_cpm_data = np.log10(1 + cpm_values)\n",
"log_cpm_data.head()"
]
},
{
"cell_type": "code",
"execution_count": 63,
"id": "aquatic-species",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"SRX9845534 2017\n",
"SRX9845541 2017\n",
"SRX9845526 2015\n",
"SRX9845530 2015\n",
"SRX9845543 2017\n",
"Name: batch, dtype: int64"
]
},
"execution_count": 63,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Look up the batch label of every sample, in the column order of log_cpm_data\n",
"batch_by_sample = metadata_subset.set_index('Sample')['batch']\n",
"batch = batch_by_sample.loc[log_cpm_data.columns]\n",
"batch.head()"
]
},
{
"cell_type": "code",
"execution_count": 72,
"id": "opponent-tomorrow",
"metadata": {},
"outputs": [
{
"ename": "TypeError",
"evalue": "'(slice(None, 2, None), slice(None, None, None))' is an invalid key",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;31m# Apply Combat's fit_transform method\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m combat_adjusted_data = combat.fit_transform(\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0mY\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlog_cpm_data\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mT\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;31m# Transpose: genes as rows, samples as columns\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mb\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbatch\u001b[0m \u001b[0;31m# Batch information\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/bin/anaconda3/lib/python3.8/site-packages/pycombat/pycombat.py\u001b[0m in \u001b[0;36mfit_transform\u001b[0;34m(self, Y, b, X, C)\u001b[0m\n\u001b[1;32m 311\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 312\u001b[0m \"\"\"\n\u001b[0;32m--> 313\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mY\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mb\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mC\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtransform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mY\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mb\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mC\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 314\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 315\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_validate_for_transform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mY\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mb\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mC\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/bin/anaconda3/lib/python3.8/site-packages/pycombat/pycombat.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, Y, b, X, C)\u001b[0m\n\u001b[1;32m 173\u001b[0m \u001b[0;31m# Find intercepts\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 174\u001b[0m alpha_hat = np.matmul(sample_per_batch/float(n_samples),\n\u001b[0;32m--> 175\u001b[0;31m beta_hat[:n_batch, :])\n\u001b[0m\u001b[1;32m 176\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mintercept_\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0malpha_hat\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 177\u001b[0m \u001b[0;31m# Find slopes for covariates/effects\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/bin/anaconda3/lib/python3.8/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3022\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnlevels\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3023\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_getitem_multilevel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3024\u001b[0;31m \u001b[0mindexer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3025\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mis_integer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindexer\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3026\u001b[0m \u001b[0mindexer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mindexer\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/bin/anaconda3/lib/python3.8/site-packages/pandas/core/indexes/base.py\u001b[0m in \u001b[0;36mget_loc\u001b[0;34m(self, key, method, tolerance)\u001b[0m\n\u001b[1;32m 3078\u001b[0m \u001b[0mcasted_key\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_maybe_cast_indexer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3079\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3080\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcasted_key\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3081\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mKeyError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3082\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32mpandas/_libs/index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32mpandas/_libs/index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n",
"\u001b[0;31mTypeError\u001b[0m: '(slice(None, 2, None), slice(None, None, None))' is an invalid key"
]
}
],
"source": [
"# Initialize Combat with desired mode ('p' for parametric or 'np' for non-parametric)\n",
"combat = Combat(mode='p')\n",
"\n",
"# Apply Combat's fit_transform method.\n",
"# pycombat slices its intermediate results NumPy-style, which raises a TypeError when\n",
"# Y is a pandas DataFrame, so hand it plain NumPy arrays instead.\n",
"combat_adjusted_data = combat.fit_transform(\n",
"    Y=log_cpm_data.T.values,  # Transposed: samples as rows, genes as columns\n",
"    b=batch.values            # Batch label per sample, in the same order as the rows of Y\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 73,
"id": "individual-tourism",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Shape of log_cpm_data.T: (44, 32707)\n",
"Length of batch: 44\n",
"Batch index: Index(['SRX9845534', 'SRX9845541', 'SRX9845526', 'SRX9845530', 'SRX9845543',\n",
" 'SRX9845527', 'SRX9845506', 'SRX9845518', 'SRX9845501', 'SRX9845536',\n",
" 'SRX9845529', 'SRX9845524', 'SRX9845507', 'SRX9845540', 'SRX9845517',\n",
" 'SRX9845532', 'SRX9845503', 'SRX9845508', 'SRX9845537', 'SRX9845520',\n",
" 'SRX9845523', 'SRX9845528', 'SRX9845516', 'SRX9845514', 'SRX9845522',\n",
" 'SRX9845502', 'SRX9845509', 'SRX9845539', 'SRX9845542', 'SRX9845504',\n",
" 'SRX9845538', 'SRX9845515', 'SRX9845519', 'SRX9845533', 'SRX9845505',\n",
" 'SRX9845531', 'SRX9845510', 'SRX9845535', 'SRX9845513', 'SRX9845512',\n",
" 'SRX9845525', 'SRX9845521', 'SRX9845500', 'SRX9845511'],\n",
" dtype='object')\n",
"log_cpm_data.T columns: Int64Index([ 0, 1, 4, 5, 6, 7, 9, 10, 11,\n",
" 12,\n",
" ...\n",
" 38253, 38254, 38256, 38257, 38258, 38259, 38260, 38261, 38262,\n",
" 38263],\n",
" dtype='int64', length=32707)\n"
]
}
],
"source": [
"# Diagnostics: compare the transposed expression matrix with the batch labels\n",
"log_cpm_by_sample = log_cpm_data.T\n",
"print(\"Shape of log_cpm_data.T:\", log_cpm_by_sample.shape)\n",
"print(\"Length of batch:\", batch.shape[0])\n",
"print(\"Batch index:\", batch.index)\n",
"print(\"log_cpm_data.T columns:\", log_cpm_by_sample.columns)"
]
},
{
"cell_type": "code",
"execution_count": 74,
"id": "threaded-station",
"metadata": {},
"outputs": [],
"source": [
"# `log_cpm_data.T` builds a new DataFrame on every access, so assigning to\n",
"# `log_cpm_data.T.index` would only modify a temporary object and never `log_cpm_data`.\n",
"# The sample IDs are the columns of `log_cpm_data`, so compare those with the batch\n",
"# labels directly; `Index.equals` also copes with a length mismatch.\n",
"assert log_cpm_data.columns.equals(batch.index), \"Indices of log_cpm_data and batch do not match!\""
]
},
{
"cell_type": "code",
"execution_count": 76,
"id": "initial-contest",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/shellytrigg/bin/anaconda3/lib/python3.8/site-packages/pycombat/pycombat.py:79: RuntimeWarning: divide by zero encountered in true_divide\n",
" (abs(del_sq_post - del_sq_prior) / del_sq_prior).max())\n"
]
}
],
"source": [
"# Combat is given plain NumPy arrays: the transposed expression values and the batch labels\n",
"log_cpm_data_array = log_cpm_data.T.values\n",
"batch_array = batch.values\n",
"\n",
"# Fit a parametric Combat model and adjust the data in a single call\n",
"combat = Combat(mode='p')\n",
"combat_adjusted_data = combat.fit_transform(Y=log_cpm_data_array, b=batch_array)\n",
"\n",
"# Wrap the adjusted matrix in a DataFrame again: transpose it back and label the rows\n",
"# with the gene IDs and the columns with the remaining column names of cpm_counts\n",
"gene_ids = cpm_counts['gene_id']\n",
"sample_columns = cpm_counts.columns[1:]\n",
"combat_adjusted_cpm = pd.DataFrame(\n",
"    data=combat_adjusted_data.T,\n",
"    index=gene_ids,\n",
"    columns=sample_columns,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 77,
"id": "accessible-security",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" SRX9845534 | \n",
" SRX9845541 | \n",
" SRX9845526 | \n",
" SRX9845530 | \n",
" SRX9845543 | \n",
" SRX9845527 | \n",
" SRX9845506 | \n",
" SRX9845518 | \n",
" SRX9845501 | \n",
" SRX9845536 | \n",
" ... | \n",
" SRX9845505 | \n",
" SRX9845531 | \n",
" SRX9845510 | \n",
" SRX9845535 | \n",
" SRX9845513 | \n",
" SRX9845512 | \n",
" SRX9845525 | \n",
" SRX9845521 | \n",
" SRX9845500 | \n",
" SRX9845511 | \n",
"
\n",
" \n",
" gene_id | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" LOC111099029 | \n",
" 0.167813 | \n",
" 0.101085 | \n",
" 0.057031 | \n",
" -0.005956 | \n",
" 0.265190 | \n",
" 0.157678 | \n",
" 0.212279 | \n",
" 0.237464 | \n",
" 0.100347 | \n",
" 0.250462 | \n",
" ... | \n",
" 0.512605 | \n",
" 0.214217 | \n",
" 0.097288 | \n",
" 0.386592 | \n",
" 0.373123 | \n",
" 0.201097 | \n",
" 0.478493 | \n",
" 0.138032 | \n",
" 0.105857 | \n",
" 0.393927 | \n",
"
\n",
" \n",
" LOC111099030 | \n",
" 1.641251 | \n",
" 0.391587 | \n",
" 1.120833 | \n",
" 0.402864 | \n",
" 0.554232 | \n",
" 0.860002 | \n",
" 0.855845 | \n",
" 0.262151 | \n",
" 0.277687 | \n",
" 0.032037 | \n",
" ... | \n",
" 0.180413 | \n",
" 1.403450 | \n",
" 0.451424 | \n",
" 0.799301 | \n",
" 1.136061 | \n",
" 0.429488 | \n",
" 0.864426 | \n",
" 0.193475 | \n",
" 1.154606 | \n",
" 0.592350 | \n",
"
\n",
" \n",
" LOC111099033 | \n",
" 0.981560 | \n",
" 1.634338 | \n",
" 0.887260 | \n",
" 0.748817 | \n",
" 1.885891 | \n",
" 1.442460 | \n",
" 1.690840 | \n",
" 1.604905 | \n",
" 0.495465 | \n",
" 1.227480 | \n",
" ... | \n",
" 1.528934 | \n",
" 1.525967 | \n",
" 0.832350 | \n",
" 1.276897 | \n",
" 1.511469 | \n",
" 1.203214 | \n",
" 1.248022 | \n",
" 1.235880 | \n",
" 0.908672 | \n",
" 0.871487 | \n",
"
\n",
" \n",
" LOC111099034 | \n",
" 0.102217 | \n",
" 0.417978 | \n",
" 0.430214 | \n",
" 0.466941 | \n",
" 0.426744 | \n",
" 0.293874 | \n",
" 0.390371 | \n",
" 0.495119 | \n",
" 0.513152 | \n",
" 0.675999 | \n",
" ... | \n",
" 0.281071 | \n",
" 0.351617 | \n",
" 0.562233 | \n",
" 0.509328 | \n",
" 0.476261 | \n",
" 0.466577 | \n",
" 0.507069 | \n",
" 0.460842 | \n",
" 0.472070 | \n",
" 0.377694 | \n",
"
\n",
" \n",
" LOC111099035 | \n",
" 0.999655 | \n",
" 1.086512 | \n",
" 1.227465 | \n",
" 1.242055 | \n",
" 1.250210 | \n",
" 1.153973 | \n",
" 0.951351 | \n",
" 1.174433 | \n",
" 1.079264 | \n",
" 1.326385 | \n",
" ... | \n",
" 1.044142 | \n",
" 1.270351 | \n",
" 1.056212 | \n",
" 1.263098 | \n",
" 1.386146 | \n",
" 1.105608 | \n",
" 1.114425 | \n",
" 1.025093 | \n",
" 1.189994 | \n",
" 0.947682 | \n",
"
\n",
" \n",
"
\n",
"
5 rows × 44 columns
\n",
"
"
],
"text/plain": [
" SRX9845534 SRX9845541 SRX9845526 SRX9845530 SRX9845543 \\\n",
"gene_id \n",
"LOC111099029 0.167813 0.101085 0.057031 -0.005956 0.265190 \n",
"LOC111099030 1.641251 0.391587 1.120833 0.402864 0.554232 \n",
"LOC111099033 0.981560 1.634338 0.887260 0.748817 1.885891 \n",
"LOC111099034 0.102217 0.417978 0.430214 0.466941 0.426744 \n",
"LOC111099035 0.999655 1.086512 1.227465 1.242055 1.250210 \n",
"\n",
" SRX9845527 SRX9845506 SRX9845518 SRX9845501 SRX9845536 ... \\\n",
"gene_id ... \n",
"LOC111099029 0.157678 0.212279 0.237464 0.100347 0.250462 ... \n",
"LOC111099030 0.860002 0.855845 0.262151 0.277687 0.032037 ... \n",
"LOC111099033 1.442460 1.690840 1.604905 0.495465 1.227480 ... \n",
"LOC111099034 0.293874 0.390371 0.495119 0.513152 0.675999 ... \n",
"LOC111099035 1.153973 0.951351 1.174433 1.079264 1.326385 ... \n",
"\n",
" SRX9845505 SRX9845531 SRX9845510 SRX9845535 SRX9845513 \\\n",
"gene_id \n",
"LOC111099029 0.512605 0.214217 0.097288 0.386592 0.373123 \n",
"LOC111099030 0.180413 1.403450 0.451424 0.799301 1.136061 \n",
"LOC111099033 1.528934 1.525967 0.832350 1.276897 1.511469 \n",
"LOC111099034 0.281071 0.351617 0.562233 0.509328 0.476261 \n",
"LOC111099035 1.044142 1.270351 1.056212 1.263098 1.386146 \n",
"\n",
" SRX9845512 SRX9845525 SRX9845521 SRX9845500 SRX9845511 \n",
"gene_id \n",
"LOC111099029 0.201097 0.478493 0.138032 0.105857 0.393927 \n",
"LOC111099030 0.429488 0.864426 0.193475 1.154606 0.592350 \n",
"LOC111099033 1.203214 1.248022 1.235880 0.908672 0.871487 \n",
"LOC111099034 0.466577 0.507069 0.460842 0.472070 0.377694 \n",
"LOC111099035 1.105608 1.114425 1.025093 1.189994 0.947682 \n",
"\n",
"[5 rows x 44 columns]"
]
},
"execution_count": 77,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the first five rows of the Combat-adjusted matrix\n",
"combat_adjusted_cpm.head(n=5)"
]
},
{
"cell_type": "code",
"execution_count": 78,
"id": "individual-therapy",
"metadata": {},
"outputs": [],
"source": [
"# Remove zero-variance genes.\n",
"# Genes are the rows of log_cpm_data and samples are its columns, so the per-gene\n",
"# variance is computed across each row (axis=1) and the boolean mask selects rows.\n",
"# Using axis=0 would score the samples instead and leave every gene in place.\n",
"non_zero_variance_genes = log_cpm_data.var(axis=1) > 0\n",
"log_cpm_data_filtered = log_cpm_data.loc[non_zero_variance_genes]"
]
},
{
"cell_type": "code",
"execution_count": 79,
"id": "hairy-blues",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/shellytrigg/bin/anaconda3/lib/python3.8/site-packages/pycombat/pycombat.py:79: RuntimeWarning: divide by zero encountered in true_divide\n",
" (abs(del_sq_post - del_sq_prior) / del_sq_prior).max())\n"
]
}
],
"source": [
"# Re-run Combat on the filtered matrix (transposed) with the same batch labels\n",
"filtered_values = log_cpm_data_filtered.T.values\n",
"combat_adjusted_data = combat.fit_transform(Y=filtered_values, b=batch.values)"
]
},
{
"cell_type": "code",
"execution_count": 84,
"id": "sharp-republic",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"True\n"
]
}
],
"source": [
"# Report whether every adjusted value is finite (no NaN, no +/-inf); expected output: True\n",
"all_values_finite = np.isfinite(combat_adjusted_data).all()\n",
"print(all_values_finite)"
]
},
{
"cell_type": "code",
"execution_count": 81,
"id": "primary-telescope",
"metadata": {},
"outputs": [],
"source": [
"# Remove low-variance genes\n",
"low_variance_threshold = 1e-6  # Define a small threshold\n",
"# Genes are the rows of log_cpm_data and samples are its columns, so the per-gene\n",
"# variance is computed across each row (axis=1) and the boolean mask selects rows.\n",
"# Using axis=0 would score the samples instead and leave every gene in place.\n",
"non_low_variance_genes = log_cpm_data.var(axis=1) > low_variance_threshold\n",
"log_cpm_data_filtered = log_cpm_data.loc[non_low_variance_genes]"
]
},
{
"cell_type": "code",
"execution_count": 82,
"id": "thick-skirt",
"metadata": {},
"outputs": [],
"source": [
"# Raise every value below 1e-6 up to that floor (larger values are left unchanged),\n",
"# intended to avoid numerical instability\n",
"clip_floor = 1e-6\n",
"log_cpm_data_clipped = log_cpm_data_filtered.clip(lower=clip_floor)"
]
},
{
"cell_type": "code",
"execution_count": 83,
"id": "popular-healthcare",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/shellytrigg/bin/anaconda3/lib/python3.8/site-packages/pycombat/pycombat.py:79: RuntimeWarning: divide by zero encountered in true_divide\n",
" (abs(del_sq_post - del_sq_prior) / del_sq_prior).max())\n"
]
}
],
"source": [
"# Run Combat once more, this time on the clipped matrix (transposed)\n",
"clipped_values = log_cpm_data_clipped.T.values\n",
"combat_adjusted_data = combat.fit_transform(Y=clipped_values, b=batch.values)"
]
},
{
"cell_type": "code",
"execution_count": 85,
"id": "molecular-health",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# Perform PCA on Combat-adjusted data.\n",
"# NOTE: this reads the `combat_adjusted_cpm` DataFrame; the `combat_adjusted_data`\n",
"# arrays produced by later Combat re-runs are not used here.\n",
"# The matrix is transposed so that samples are the rows and genes are the features.\n",
"scaler = StandardScaler()\n",
"scaled_data = scaler.fit_transform(combat_adjusted_cpm.T)\n",
"pca = PCA(n_components=2)\n",
"pca_results = pca.fit_transform(scaled_data)\n",
"\n",
"# Create a PCA DataFrame and attach the sample metadata by sample ID\n",
"pca_df = pd.DataFrame(pca_results, columns=['PC1', 'PC2'])\n",
"pca_df['Sample'] = combat_adjusted_cpm.columns\n",
"pca_df = pd.merge(pca_df, metadata_subset, on='Sample')\n",
"\n",
"# Plot PCA after Combat adjustment: color encodes batch, marker shape encodes treatment\n",
"plt.figure(figsize=(12, 10))\n",
"sns.scatterplot(\n",
"    data=pca_df,\n",
"    x='PC1', y='PC2',\n",
"    hue='batch', style='treatment',\n",
"    s=100, palette='Set2'\n",
")\n",
"plt.title('PCA After Combat Adjustment')\n",
"plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]*100:.2f}% Variance)')\n",
"plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]*100:.2f}% Variance)')\n",
"# The legend holds both the batch colors and the treatment markers, so title it for both\n",
"plt.legend(title='Batch / Treatment', bbox_to_anchor=(1.05, 1), loc='upper left')\n",
"plt.grid()\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"id": "severe-macedonia",
"metadata": {},
"source": [
"If the PCA plot looks the same before and after batch effect correction, this could indicate one of the following:\n",
"\n",
"Possible Explanations\n",
"\n",
"Insufficient Batch Adjustment: Combat may not have fully corrected the batch effects.\n",
"This can happen if batch is highly confounded with other factors (e.g., treatment or time point).\n",
"\n",
"Low Impact of Batch: Batch effects might not have been a major source of variance in the data to begin with. PCA often highlights dominant sources of variation, which may be unrelated to batch.\n",
"\n",
"Residual Batch Effects in High Variance Genes: The Combat adjustment could have reduced batch effects in most genes, but residual effects might remain in genes with the highest variance.\n",
"\n",
"Biological Signals are Confounded with Batch: If batch and biological variables (e.g., treatment or time point) are correlated, Combat may struggle to disentangle them.\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}