{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Stabilized ICA for transcriptomic data \n", "\n", "In this Jupyter notebook, we present a short example of how to use our stabilized ICA algorithm. We study the extraction of ICA components for a transcriptomic data set of NSCLC patients. \n", "\n", "**Note:** To run this Jupyter notebook, you first need to extract the data set \"data.csv\" from the zip file \"data.zip\" (same directory)." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "%load_ext autoreload\n", "%autoreload 2\n", "\n", "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import time" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 0. Load the data set\n", "\n", "This data set was extracted from [\"Defining the Biological Basis of Radiomic Phenotypes in Lung Cancer\" Grossmann et al. 2017](https://elifesciences.org/articles/23421). \n", "It contains the expression of 21,766 unique genes for 269 patients with non-small cell lung cancer (NSCLC) treated at the H. Lee Moffitt Cancer Center, Tampa, Florida, USA. Gene expression values were measured on a custom Rosetta/Merck Affymetrix 2.0 microarray chipset and normalized with the robust multi-array average (RMA) algorithm." ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
| \n", " | 3643 | \n", "84263 | \n", "7171 | \n", "2934 | \n", "11052 | \n", "1241 | \n", "6453 | \n", "57541 | \n", "9349 | \n", "11165 | \n", "... | \n", "643669 | \n", "1572 | \n", "8551 | \n", "26784 | \n", "26783 | \n", "26782 | \n", "26779 | \n", "26778 | \n", "26777 | \n", "100132941 | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| RadioGenomic-017 | \n", "5.205151 | \n", "7.097989 | \n", "9.559617 | \n", "8.396808 | \n", "7.603719 | \n", "7.990605 | \n", "10.044401 | \n", "9.054930 | \n", "7.383169 | \n", "8.177010 | \n", "... | \n", "6.419273 | \n", "3.809826 | \n", "6.507880 | \n", "6.572121 | \n", "5.400848 | \n", "5.951391 | \n", "3.381860 | \n", "9.825584 | \n", "2.905091 | \n", "5.622438 | \n", "
| RadioGenomic-055 | \n", "5.615738 | \n", "6.585052 | \n", "9.777869 | \n", "9.082415 | \n", "8.639498 | \n", "6.781274 | \n", "9.541826 | \n", "8.866110 | \n", "6.422702 | \n", "7.196294 | \n", "... | \n", "5.753828 | \n", "4.186127 | \n", "6.821582 | \n", "7.031406 | \n", "4.852417 | \n", "6.140850 | \n", "2.629760 | \n", "9.005145 | \n", "3.366466 | \n", "5.495330 | \n", "
| RadioGenomic-227 | \n", "5.679276 | \n", "7.747854 | \n", "10.648704 | \n", "9.127985 | \n", "7.369421 | \n", "7.203773 | \n", "8.972255 | \n", "8.328371 | \n", "7.269232 | \n", "7.449183 | \n", "... | \n", "5.666999 | \n", "4.316130 | \n", "6.637855 | \n", "6.248824 | \n", "4.664228 | \n", "5.767970 | \n", "2.911470 | \n", "8.674466 | \n", "3.337194 | \n", "6.308605 | \n", "
| RadioGenomic-222 | \n", "5.317341 | \n", "7.196276 | \n", "10.949771 | \n", "8.098896 | \n", "7.639882 | \n", "7.971876 | \n", "10.159637 | \n", "8.667702 | \n", "8.474250 | \n", "7.271477 | \n", "... | \n", "5.531060 | \n", "3.403776 | \n", "7.059419 | \n", "6.201873 | \n", "4.690005 | \n", "6.256286 | \n", "4.119688 | \n", "9.099659 | \n", "3.181781 | \n", "5.740033 | \n", "
| RadioGenomic-212 | \n", "7.196904 | \n", "9.346492 | \n", "9.673778 | \n", "9.358636 | \n", "8.741693 | \n", "7.616498 | \n", "10.376653 | \n", "8.701461 | \n", "6.601991 | \n", "7.344651 | \n", "... | \n", "5.519642 | \n", "3.796049 | \n", "7.332635 | \n", "6.050121 | \n", "4.898523 | \n", "6.537895 | \n", "3.600895 | \n", "8.792510 | \n", "2.945391 | \n", "5.835411 | \n", "
5 rows × 21766 columns
\n", "| \n", " | 3643 | \n", "84263 | \n", "7171 | \n", "2934 | \n", "11052 | \n", "1241 | \n", "6453 | \n", "57541 | \n", "9349 | \n", "11165 | \n", "... | \n", "643669 | \n", "1572 | \n", "8551 | \n", "26784 | \n", "26783 | \n", "26782 | \n", "26779 | \n", "26778 | \n", "26777 | \n", "100132941 | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| metagene 0 | \n", "-0.502359 | \n", "0.063330 | \n", "0.619226 | \n", "-0.584321 | \n", "-0.034392 | \n", "-0.645843 | \n", "0.018026 | \n", "0.165158 | \n", "-0.390837 | \n", "0.026501 | \n", "... | \n", "0.211179 | \n", "-0.369237 | \n", "-0.975147 | \n", "-0.142125 | \n", "0.339079 | \n", "0.118280 | \n", "-0.761259 | \n", "-0.183313 | \n", "-0.006010 | \n", "-0.323200 | \n", "
| metagene 1 | \n", "0.481376 | \n", "-0.085962 | \n", "-0.126218 | \n", "0.490215 | \n", "0.242829 | \n", "0.751853 | \n", "0.465473 | \n", "-0.415335 | \n", "0.587996 | \n", "0.014006 | \n", "... | \n", "-0.362256 | \n", "3.178655 | \n", "-0.159200 | \n", "-0.211356 | \n", "-0.387652 | \n", "-0.137837 | \n", "-0.474530 | \n", "0.204819 | \n", "0.312316 | \n", "-0.018258 | \n", "
| metagene 2 | \n", "-0.813705 | \n", "-0.287146 | \n", "0.135293 | \n", "0.005913 | \n", "1.361459 | \n", "0.175264 | \n", "-0.335616 | \n", "-0.120486 | \n", "-2.524144 | \n", "0.933776 | \n", "... | \n", "-0.640554 | \n", "-0.777672 | \n", "-0.313629 | \n", "0.010074 | \n", "0.255665 | \n", "1.612827 | \n", "0.519558 | \n", "0.661906 | \n", "0.016296 | \n", "-0.335104 | \n", "
| metagene 3 | \n", "0.029950 | \n", "0.171725 | \n", "0.607405 | \n", "1.403389 | \n", "-0.513840 | \n", "2.216392 | \n", "3.532994 | \n", "0.286914 | \n", "-1.342845 | \n", "1.290781 | \n", "... | \n", "0.489110 | \n", "0.356925 | \n", "0.674017 | \n", "-1.592262 | \n", "-1.058137 | \n", "0.383702 | \n", "0.648317 | \n", "-0.687954 | \n", "-0.364239 | \n", "-0.733333 | \n", "
| metagene 4 | \n", "1.338965 | \n", "-0.540206 | \n", "-0.242459 | \n", "-1.213108 | \n", "0.724185 | \n", "4.785730 | \n", "0.466036 | \n", "-0.455820 | \n", "-0.756496 | \n", "-0.013456 | \n", "... | \n", "0.950401 | \n", "0.123224 | \n", "0.326030 | \n", "2.161343 | \n", "0.678265 | \n", "1.191926 | \n", "1.467889 | \n", "1.275098 | \n", "0.168216 | \n", "-0.796021 | \n", "
5 rows × 21766 columns
\n", "| \n", " | Pathway identifier | \n", "Pathway name | \n", "#Entities found | \n", "#Entities total | \n", "Entities ratio | \n", "Entities pValue | \n", "Entities FDR | \n", "#Reactions found | \n", "#Reactions total | \n", "Reactions ratio | \n", "Species identifier | \n", "Species name | \n", "Submitted entities found | \n", "Mapped entities | \n", "Found reaction identifiers | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2 | \n", "R-HSA-1474228 | \n", "Degradation of the extracellular matrix | \n", "40.0 | \n", "148.0 | \n", "0.009799 | \n", "0.0 | \n", "0.0 | \n", "87.0 | \n", "105.0 | \n", "0.007612 | \n", "9606 | \n", "Homo sapiens | \n", "50509;1284;1282;1281;176;1634;1513;80781;2006;... | \n", "\n", " | R-HSA-8940554;R-HSA-1602458;R-HSA-1474196;R-HS... | \n", "
| 3 | \n", "R-HSA-2022090 | \n", "Assembly of collagen fibrils and other multime... | \n", "26.0 | \n", "67.0 | \n", "0.004436 | \n", "0.0 | \n", "0.0 | \n", "21.0 | \n", "26.0 | \n", "0.001885 | \n", "9606 | \n", "Homo sapiens | \n", "50509;1284;1282;1281;80781;1278;1277;5118;7837... | \n", "\n", " | R-HSA-2395302;R-HSA-2022141;R-HSA-2299620;R-HS... | \n", "
| 4 | \n", "R-HSA-1442490 | \n", "Collagen degradation | \n", "29.0 | \n", "69.0 | \n", "0.004568 | \n", "0.0 | \n", "0.0 | \n", "27.0 | \n", "34.0 | \n", "0.002465 | \n", "9606 | \n", "Homo sapiens | \n", "50509;1284;1282;1281;1634;1513;80781;1278;1277... | \n", "\n", " | R-HSA-1564117;R-HSA-1474196;R-HSA-1474197;R-HS... | \n", "
| \n", " | Category | \n", "ID | \n", "Name | \n", "PValue | \n", "QValueFDRBH | \n", "QValueFDRBY | \n", "QValueBonferroni | \n", "TotalGenes | \n", "GenesInTerm | \n", "GenesInQuery | \n", "GenesInTermInQuery | \n", "Source | \n", "URL | \n", "Gene_Symbol | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "GeneOntologyMolecularFunction | \n", "GO:0045296 | \n", "cadherin binding | \n", "5.468367e-10 | \n", "3.253678e-07 | \n", "0.000002 | \n", "3.253678e-07 | \n", "19869 | \n", "341 | \n", "408 | \n", "28 | \n", "\n", " | \n", " | VAPB,ADD1,AHNAK,DHX29,VCL,LIMA1,EPN2,USO1,PKN2... | \n", "
| 1 | \n", "GeneOntologyMolecularFunction | \n", "GO:0003779 | \n", "actin binding | \n", "4.007864e-06 | \n", "1.192340e-03 | \n", "0.008307 | \n", "2.384679e-03 | \n", "19869 | \n", "492 | \n", "408 | \n", "27 | \n", "\n", " | \n", " | MAP1B,ADD1,LMOD1,PLS3,VCL,FLII,LIMA1,ENC1,YWHA... | \n", "
| 2 | \n", "GeneOntologyMolecularFunction | \n", "GO:0042393 | \n", "histone binding | \n", "6.316044e-06 | \n", "1.252682e-03 | \n", "0.008727 | \n", "3.758046e-03 | \n", "19869 | \n", "256 | \n", "408 | \n", "18 | \n", "\n", " | \n", " | CHD1,EZH1,PHF2,SET,BRD9,SMARCC1,SART3,ZMYND11,... | \n", "
| \n", " | inputs | \n", "entrezgene | \n", "notfound | \n", "
|---|---|---|---|
| metagene 0 | \n", "[6192, 9086, 8653, 8284, 8287, 246126, 7404, 6... | \n", "None | \n", "None | \n", "
| metagene 1 | \n", "[128602, 92747, 399949, 127003, 144448, 133690... | \n", "None | \n", "None | \n", "
| metagene 2 | \n", "[388468, 400968, 29044, 228, 160857, 83417, 64... | \n", "None | \n", "None | \n", "
| metagene 3 | \n", "[79832, 23347, 7403, 659, 83448, 1038, 163590,... | \n", "None | \n", "None | \n", "
| metagene 4 | \n", "[283131, 84719, 79923, 29896, 100642175, 27099... | \n", "None | \n", "None | \n", "
| metagene 5 | \n", "[8685, 366, 1118, 81501, 6348, 126014, 597, 11... | \n", "None | \n", "None | \n", "
| metagene 6 | \n", "[1301, 131578, 6423, 8483, 26585, 84624, 22801... | \n", "None | \n", "None | \n", "
| metagene 7 | \n", "[9787, 55872, 83540, 259266, 4751, 1063, 55635... | \n", "None | \n", "None | \n", "
| metagene 8 | \n", "[23440, 7200, 80309, 4842, 10842, 5354, 6857, ... | \n", "None | \n", "None | \n", "
| metagene 9 | \n", "[260436, 931, 10563, 83416, 3507, 973, 1380, 6... | \n", "None | \n", "None | \n", "