{ "cells": [ { "metadata": {}, "cell_type": "markdown", "source": [ "# Convert a tabular peptides dataset into the corresponding proteins\n", "In this example, we: \n", " 1. load a tabular dataset of peptide intensities,\n", " 2. convert it into the corresponding proteins as peptide counts,\n", " 3. convert it into proteins by summing the peptide intensities, and\n", " 4. export the data into Pandas data frames and join them for further processing. \n", "\n", "First, load the required libraries." ], "id": "676456ef84bcb799" }, { "metadata": { "ExecuteTime": { "end_time": "2024-11-20T13:17:45.974668Z", "start_time": "2024-11-20T13:17:45.134063Z" } }, "cell_type": "code", "source": [ "import pandas as pd\n", "import omicspylib as opl\n", "from omicspylib import PeptidesDataset\n", "print(f'omicspylib version: {opl.__version__}')" ], "id": "7044ebd2fa5d450e", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "omicspylib version: 0.0.7\n" ] } ], "execution_count": 1 }, { "metadata": {}, "cell_type": "markdown", "source": [ "Then prepare your data as a Pandas data frame. You need to specify the column name containing the peptide identifier (`peptide_id` in this example), the column name containing the protein identifier required to perform the group-by operation (`protein_id` in this example), and the column names for all experimental conditions, as shown below.\n", "\n", "It is expected that you perform any cleaning required for your use case (e.g. removal of reverse hits, contaminants, modified peptides, or peptides shared across proteins). 
" ], "id": "68aecb5478a22945" }, { "metadata": { "ExecuteTime": { "end_time": "2024-11-20T13:17:46.074440Z", "start_time": "2024-11-20T13:17:46.057584Z" } }, "cell_type": "code", "source": [ "data_df = pd.read_csv('data/peptides_dataset.tsv', sep='\\t')\n", "\n", "config = {\n", " 'id_col': 'peptide_id',\n", " 'conditions': {\n", " 'c1': ['c1_rep1', 'c1_rep2', 'c1_rep3', 'c1_rep4', 'c1_rep5'],\n", " 'c2': ['c2_rep1', 'c2_rep2', 'c2_rep3', 'c2_rep4', 'c2_rep5'],\n", " 'c3': ['c3_rep1', 'c3_rep2', 'c3_rep3', 'c3_rep4', 'c3_rep5'],\n", " },\n", " 'protein_id_col': 'protein_id',\n", "}\n", "data_df.head(3)" ], "id": "ea812c6f1a113fff", "outputs": [ { "data": { "text/plain": [ " peptide_id protein_id c1_rep1 c1_rep2 c1_rep3 c1_rep4 \\\n", "0 pept147 prot0 1740.912460 0.000000 1393.260017 4685.874636 \n", "1 pept424 prot0 3668.876134 0.000000 0.000000 303.011791 \n", "2 pept631 prot0 0.000000 3138.459061 3409.906069 1712.639948 \n", "\n", " c1_rep5 c2_rep1 c2_rep2 c2_rep3 c2_rep4 \\\n", "0 513.393605 502.109101 949.462139 0.000000 3006.548317 \n", "1 1314.382432 404.828763 3723.604607 11838.405382 7586.141805 \n", "2 987.488051 0.000000 8197.162348 0.000000 2067.977126 \n", "\n", " c2_rep5 c3_rep1 c3_rep2 c3_rep3 c3_rep4 \\\n", "0 671.891115 4123.628101 11583.385623 3114.882410 2812.034141 \n", "1 0.000000 336.363330 0.000000 200.425728 3891.630707 \n", "2 1111.872036 9229.125064 0.000000 19303.065270 2427.103374 \n", "\n", " c3_rep5 \n", "0 2195.550530 \n", "1 1395.146624 \n", "2 491.195810 " ], "text/html": [ "
| \n", " | peptide_id | \n", "protein_id | \n", "c1_rep1 | \n", "c1_rep2 | \n", "c1_rep3 | \n", "c1_rep4 | \n", "c1_rep5 | \n", "c2_rep1 | \n", "c2_rep2 | \n", "c2_rep3 | \n", "c2_rep4 | \n", "c2_rep5 | \n", "c3_rep1 | \n", "c3_rep2 | \n", "c3_rep3 | \n", "c3_rep4 | \n", "c3_rep5 | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "pept147 | \n", "prot0 | \n", "1740.912460 | \n", "0.000000 | \n", "1393.260017 | \n", "4685.874636 | \n", "513.393605 | \n", "502.109101 | \n", "949.462139 | \n", "0.000000 | \n", "3006.548317 | \n", "671.891115 | \n", "4123.628101 | \n", "11583.385623 | \n", "3114.882410 | \n", "2812.034141 | \n", "2195.550530 | \n", "
| 1 | \n", "pept424 | \n", "prot0 | \n", "3668.876134 | \n", "0.000000 | \n", "0.000000 | \n", "303.011791 | \n", "1314.382432 | \n", "404.828763 | \n", "3723.604607 | \n", "11838.405382 | \n", "7586.141805 | \n", "0.000000 | \n", "336.363330 | \n", "0.000000 | \n", "200.425728 | \n", "3891.630707 | \n", "1395.146624 | \n", "
| 2 | \n", "pept631 | \n", "prot0 | \n", "0.000000 | \n", "3138.459061 | \n", "3409.906069 | \n", "1712.639948 | \n", "987.488051 | \n", "0.000000 | \n", "8197.162348 | \n", "0.000000 | \n", "2067.977126 | \n", "1111.872036 | \n", "9229.125064 | \n", "0.000000 | \n", "19303.065270 | \n", "2427.103374 | \n", "491.195810 | \n", "
| \n", " | n_peptides_c1_rep1 | \n", "n_peptides_c1_rep2 | \n", "n_peptides_c1_rep3 | \n", "n_peptides_c1_rep4 | \n", "n_peptides_c1_rep5 | \n", "n_peptides_c2_rep1 | \n", "n_peptides_c2_rep2 | \n", "n_peptides_c2_rep3 | \n", "n_peptides_c2_rep4 | \n", "n_peptides_c2_rep5 | \n", "... | \n", "intensity_c2_rep1 | \n", "intensity_c2_rep2 | \n", "intensity_c2_rep3 | \n", "intensity_c2_rep4 | \n", "intensity_c2_rep5 | \n", "intensity_c3_rep1 | \n", "intensity_c3_rep2 | \n", "intensity_c3_rep3 | \n", "intensity_c3_rep4 | \n", "intensity_c3_rep5 | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| protein_id | \n", "\n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " |
| prot0 | \n", "7 | \n", "5 | \n", "8 | \n", "8 | \n", "9 | \n", "8 | \n", "8 | \n", "7 | \n", "7 | \n", "8 | \n", "... | \n", "10905.868929 | \n", "29978.048873 | \n", "39279.995012 | \n", "16447.245791 | \n", "15476.546193 | \n", "22106.072629 | \n", "18638.301516 | \n", "41024.311866 | \n", "69277.036001 | \n", "10825.451618 | \n", "
| prot1 | \n", "9 | \n", "7 | \n", "8 | \n", "7 | \n", "5 | \n", "4 | \n", "7 | \n", "9 | \n", "9 | \n", "7 | \n", "... | \n", "6798.912758 | \n", "17346.823885 | \n", "14392.470307 | \n", "22301.020074 | \n", "34467.525125 | \n", "14886.915695 | \n", "35368.129215 | \n", "5180.276570 | \n", "13585.279883 | \n", "36810.953282 | \n", "
| prot10 | \n", "10 | \n", "9 | \n", "10 | \n", "6 | \n", "7 | \n", "8 | \n", "7 | \n", "6 | \n", "7 | \n", "7 | \n", "... | \n", "50901.080878 | \n", "16509.573663 | \n", "5293.420927 | \n", "31092.895805 | \n", "13232.936859 | \n", "9335.053026 | \n", "11066.451746 | \n", "5719.300722 | \n", "18697.848325 | \n", "8109.855595 | \n", "
3 rows × 30 columns
\n", "