{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "%load_ext autoreload\n", "%autoreload 2" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "\n", "import os\n", "import sys\n", "import torch\n", "import numpy as np\n", "import pandas as pd\n", "\n", "from mpra_test.mpra_test import MPRA_Dataset" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Plant" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "MPRA_Dataset object with n_seq × n_readout = 365518 × 5\n", " obs X: 'id', 'lib', 'species', 'chr', 'loci (start)', 'loci (end)', 'orientation'\n", " obs Y: ''\n", " readout: 'cold', 'warm', 'dark', 'light', 'maize'\n", "Additional information:\n", " MPRA Technique: Lentiviral MPRA\n", " Readout Assay: STARR-seq\n", " Regulatory Element: Enhancer\n", " Sequence Origin: Native\n", " Species: Arabidopsis thaliana (At), Solanum lycopersicum (Sl), Sorghum bicolor (Sb), Zea mays (Zm)" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "name_paper = 'Plant_2024_Jores'\n", "name_dataset = 'native'\n", "\n", "mpra_dataset = MPRA_Dataset.load(name_paper, name_dataset)\n", "mpra_dataset" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "MPRA_Dataset object with n_seq × n_readout = 365518 × 5\n", " obs X: 'id', 'lib', 'species', 'chr', 'loci (start)', 'loci (end)', 'orientation'\n", " obs Y: ''\n", " readout: 'cold', 'warm', 'dark', 'light', 'maize'\n", "Additional information:\n", " MPRA Technique: Lentiviral MPRA\n", " Readout Assay: STARR-seq\n", " Regulatory Element: Enhancer\n", " Sequence Origin: Native\n", " Species: Arabidopsis thaliana (At), Solanum lycopersicum (Sl), Sorghum bicolor (Sb), Zea mays (Zm)" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mpra_dataset" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "MPRA_Dataset object with n_seq × n_readout = 365518 × 5\n", " obs X: 'id', 'lib', 'species', 'chr', 'loci (start)', 'loci (end)', 'orientation'\n", " obs Y: ''\n", " readout: 'cold', 'warm', 'dark', 'light', 'maize'\n", "Additional information:\n", " MPRA Technique: Lentiviral MPRA\n", " Readout Assay: STARR-seq\n", " Regulatory Element: Enhancer\n", " Sequence Origin: Native\n", " Species: Arabidopsis thaliana (At), Solanum lycopersicum (Sl), Sorghum bicolor (Sb), Zea mays (Zm)\n", "\n" ] } ], "source": [ "print(mpra_dataset)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idlibspecieschrloci (start)loci (end)orientation
0At-1ACRAt114101580Forward
1At-1ACRAt115801410Reverse
2At-10ACRAt196409810Forward
3At-100ACRAt1108510108680Forward
4At-100ACRAt1108680108510Reverse
........................
365513Zm-sh1531ACRZm7115534966115534796Reverse
365514Zm-sh286ACRZm1291054873291055043Forward
365515Zm-sh407ACRZm28069216880692338Forward
365516Zm-sh654ACRZm38544847685448306Reverse
365517Zm-sh779ACRZm3233053957233053787Reverse
\n", "

365518 rows × 7 columns

\n", "
" ], "text/plain": [ " id lib species chr loci (start) loci (end) orientation\n", "0 At-1 ACR At 1 1410 1580 Forward\n", "1 At-1 ACR At 1 1580 1410 Reverse\n", "2 At-10 ACR At 1 9640 9810 Forward\n", "3 At-100 ACR At 1 108510 108680 Forward\n", "4 At-100 ACR At 1 108680 108510 Reverse\n", "... ... ... ... ... ... ... ...\n", "365513 Zm-sh1531 ACR Zm 7 115534966 115534796 Reverse\n", "365514 Zm-sh286 ACR Zm 1 291054873 291055043 Forward\n", "365515 Zm-sh407 ACR Zm 2 80692168 80692338 Forward\n", "365516 Zm-sh654 ACR Zm 3 85448476 85448306 Reverse\n", "365517 Zm-sh779 ACR Zm 3 233053957 233053787 Reverse\n", "\n", "[365518 rows x 7 columns]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mpra_dataset.obs_X" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
coldwarmdarklightmaize
0-0.2557010.1302930.187041-0.088923-0.507449
1-0.1891470.3846060.0201280.051520-1.133404
20.0390150.9221370.6000000.475028-0.420744
30.4500371.5534432.0495451.2064740.345508
40.3989440.3867372.2967981.1756710.715303
..................
365513NaNNaNNaNNaN-0.576264
365514NaNNaNNaNNaN-0.243445
365515NaNNaNNaNNaN-0.358087
365516NaNNaNNaNNaN-1.606015
365517NaNNaNNaNNaN-1.383622
\n", "

365518 rows × 5 columns

\n", "
" ], "text/plain": [ " cold warm dark light maize\n", "0 -0.255701 0.130293 0.187041 -0.088923 -0.507449\n", "1 -0.189147 0.384606 0.020128 0.051520 -1.133404\n", "2 0.039015 0.922137 0.600000 0.475028 -0.420744\n", "3 0.450037 1.553443 2.049545 1.206474 0.345508\n", "4 0.398944 0.386737 2.296798 1.175671 0.715303\n", "... ... ... ... ... ...\n", "365513 NaN NaN NaN NaN -0.576264\n", "365514 NaN NaN NaN NaN -0.243445\n", "365515 NaN NaN NaN NaN -0.358087\n", "365516 NaN NaN NaN NaN -1.606015\n", "365517 NaN NaN NaN NaN -1.383622\n", "\n", "[365518 rows x 5 columns]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mpra_dataset.Y" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "MPRA_Dataset object with n_seq × n_readout = 13048 × 5\n", " obs X: 'id', 'lib', 'species', 'chr', 'loci (start)', 'loci (end)', 'orientation'\n", " obs Y: ''\n", " readout: 'cold', 'warm', 'dark', 'light', 'maize'\n", "Additional information:\n", " MPRA Technique: Lentiviral MPRA\n", " Readout Assay: STARR-seq\n", " Regulatory Element: Enhancer\n", " Sequence Origin: Native\n", " Species: Arabidopsis thaliana (At), Solanum lycopersicum (Sl), Sorghum bicolor (Sb), Zea mays (Zm)\n", "\n" ] } ], "source": [ "print(mpra_dataset[mpra_dataset.Y['dark'] > 3])" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "MPRA_Dataset object with n_seq × n_readout = 3 × 5\n", " obs X: 'id', 'lib', 'species', 'chr', 'loci (start)', 'loci (end)', 'orientation'\n", " obs Y: ''\n", " readout: 'cold', 'warm', 'dark', 'light', 'maize'\n", "Additional information:\n", " MPRA Technique: Lentiviral MPRA\n", " Readout Assay: STARR-seq\n", " Regulatory Element: Enhancer\n", " Sequence Origin: Native\n", " Species: Arabidopsis thaliana (At), Solanum lycopersicum (Sl), Sorghum bicolor (Sb), Zea mays (Zm)" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mpra_dataset[np.arange(3)]" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(3, 13)" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mpra_dataset[torch.arange(3)].shape" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "MPRA_Dataset object with n_seq × n_readout = 3 × 5\n", " obs X: 'id', 'lib', 'species', 'chr', 'loci (start)', 'loci (end)', 'orientation'\n", " obs Y: ''\n", " readout: 'cold', 'warm', 'dark', 'light', 'maize'\n", "Additional information:\n", " MPRA Technique: Lentiviral MPRA\n", " Readout Assay: STARR-seq\n", " Regulatory Element: Enhancer\n", " Sequence Origin: Native\n", " Species: Arabidopsis thaliana (At), Solanum lycopersicum (Sl), Sorghum bicolor (Sb), Zea mays (Zm)" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mpra_dataset[[0,1,2]]" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "MPRA_Dataset object with n_seq × n_readout = 172470 × 5\n", " obs X: 'id', 'lib', 'species', 'chr', 'loci (start)', 'loci (end)', 'orientation'\n", " obs Y: ''\n", " readout: 'cold', 'warm', 'dark', 'light', 'maize'\n", "Additional information:\n", " MPRA Technique: Lentiviral MPRA\n", " Readout Assay: STARR-seq\n", " Regulatory Element: Enhancer\n", " Sequence Origin: Native\n", " Species: Arabidopsis thaliana (At), Solanum lycopersicum (Sl), Sorghum bicolor (Sb), Zea mays (Zm)" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mpra_dataset[mpra_dataset.obs_X['chr'].isin([2, 3, 5, 7])]" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(MPRA_Dataset object with n_seq × n_readout = 3 × 5\n", " obs X: 'id', 'lib', 'species', 'chr', 'loci (start)', 'loci (end)', 'orientation'\n", " obs Y: ''\n", " readout: 'cold', 'warm', 'dark', 'light', 'maize'\n", " Additional information:\n", " MPRA Technique: Lentiviral MPRA\n", " Readout Assay: STARR-seq\n", " Regulatory Element: Enhancer\n", " Sequence Origin: Native\n", " Species: Arabidopsis thaliana (At), Solanum lycopersicum (Sl), Sorghum bicolor (Sb), Zea mays (Zm),\n", " MPRA_Dataset object with n_seq × n_readout = 4 × 5\n", " obs X: 'id', 'lib', 'species', 'chr', 'loci (start)', 'loci (end)', 'orientation'\n", " obs Y: ''\n", " readout: 'cold', 'warm', 'dark', 'light', 'maize'\n", " Additional information:\n", " MPRA Technique: Lentiviral MPRA\n", " Readout Assay: STARR-seq\n", " Regulatory Element: Enhancer\n", " Sequence Origin: Native\n", " Species: Arabidopsis thaliana (At), Solanum lycopersicum (Sl), Sorghum bicolor (Sb), Zea mays (Zm))" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mpra_dataset[[0,1,2]], mpra_dataset[:3]" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "chr_valid = [4, 9]\n", "chr_infer = [2, 7]\n", "\n", "idx_valid = mpra_dataset.obs_X['chr'].isin(chr_valid)\n", "idx_infer = mpra_dataset.obs_X['chr'].isin(chr_infer)\n", "idx_train = ~ (idx_valid | idx_infer)\n" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "len(dataloader_train.dataset): 216247\n", "len(dataloader_valid.dataset): 57836\n", "len(dataloader_infer.dataset): 63737\n" ] } ], "source": [ "batch_size = 64\n", "num_workers = 4\n", "dataloader_train = mpra_dataset[idx_train].to_DataLoader(\n", " cols_Y = ['cold', 'warm', 'dark', 'light'],\n", " batch_size = batch_size, num_workers = num_workers, shuffle = True, \n", ")\n", "dataloader_valid = mpra_dataset[idx_valid].to_DataLoader(\n", " cols_Y = ['Y: cold', 'Y: warm', 'Y: dark', 'Y: light'],\n", " batch_size = batch_size, num_workers = num_workers, shuffle = False, \n", ")\n", "dataloader_infer = mpra_dataset[idx_infer].to_DataLoader(\n", " cols_Y = ['Y: cold', 'Y: warm', 'Y: dark', 'Y: light'],\n", " batch_size = batch_size, num_workers = num_workers, shuffle = False, \n", ")\n", "\n", "print('len(dataloader_train.dataset): ', len(dataloader_train.dataset))\n", "print('len(dataloader_valid.dataset): ', len(dataloader_valid.dataset))\n", "print('len(dataloader_infer.dataset): ', len(dataloader_infer.dataset))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Nature 2022 Regev" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "MPRA_Dataset object with n_seq × n_readout = 3331 × 1\n", " obs X: ''\n", " obs Y: ''\n", " readout: 'expr'\n", "Additional information:\n", " Description: Test data\n", " Grow Condition: YPD\n", " MPRA Technique: Classic MPRA\n", " Readout Assay: FACS\n", " Regulatory Element: Promoter\n", " Sequence Origin: Random\n", " Species: Yeast" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "name_paper = 'Nature_2022_Regev'\n", "name_dataset = 'test_complex'\n", "\n", "mpra_dataset = MPRA_Dataset.load(name_paper, name_dataset)\n", "mpra_dataset" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "n = len(mpra_dataset)\n", "idx_valid, idx_infer, idx_train = np.split(np.random.permutation(n), [int(0.25*n), int(0.50*n)])" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "len(dataloader_train.dataset): 1666\n", "len(dataloader_valid.dataset): 832\n", "len(dataloader_infer.dataset): 833\n" ] } ], "source": [ "batch_size = 64\n", "num_workers = 4\n", "dataloader_train = mpra_dataset[idx_train].to_DataLoader(\n", " batch_size = batch_size, num_workers = num_workers, shuffle = True, \n", ")\n", "dataloader_valid = mpra_dataset[idx_valid].to_DataLoader(\n", " batch_size = batch_size, num_workers = num_workers, shuffle = False, \n", ")\n", "dataloader_infer = mpra_dataset[idx_infer].to_DataLoader(\n", " batch_size = batch_size, num_workers = num_workers, shuffle = False, \n", ")\n", "\n", "print('len(dataloader_train.dataset): ', len(dataloader_train.dataset))\n", "print('len(dataloader_valid.dataset): ', len(dataloader_valid.dataset))\n", "print('len(dataloader_infer.dataset): ', len(dataloader_infer.dataset))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Genome Research 2017 Seelig" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "MPRA_Dataset object with n_seq × n_readout = 489348 × 1\n", " obs X: ''\n", " obs Y: 'raw count (input)', 'raw count (output)'\n", " readout: 'expr'\n", "Additional information:\n", " Description: Random 5' UTRs\n", " MPRA Technique: Classic MPRA\n", " Readout Assay: RNA-seq\n", " Regulatory Element: 5' UTR\n", " Sequence Origin: Random\n", " Species: Yeast" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "name_paper = 'GenomeResearch_2017_Seelig'\n", "name_dataset = 'random'\n", "\n", "mpra_dataset = MPRA_Dataset.load(name_paper, name_dataset)\n", "mpra_dataset" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'Description': \"Random 5' UTRs\",\n", " 'MPRA Technique': 'Classic MPRA',\n", " 'Readout Assay': 'RNA-seq',\n", " 'Regulatory Element': \"5' UTR\",\n", " 'Sequence Origin': 'Random',\n", " 'Species': 'Yeast'}" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mpra_dataset.info" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "n = len(mpra_dataset)\n", "idx_valid, idx_infer, idx_train = np.split(np.random.permutation(n), [int(0.25*n), int(0.50*n)])" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "len(dataloader_train.dataset): 244674\n", "len(dataloader_valid.dataset): 122337\n", "len(dataloader_infer.dataset): 122337\n" ] } ], "source": [ "batch_size = 64\n", "num_workers = 4\n", "dataloader_train = mpra_dataset[idx_train].to_DataLoader(\n", " batch_size = batch_size, num_workers = num_workers, shuffle = True, \n", ")\n", "dataloader_valid = mpra_dataset[idx_valid].to_DataLoader(\n", " batch_size = batch_size, num_workers = num_workers, shuffle = False, \n", ")\n", "dataloader_infer = mpra_dataset[idx_infer].to_DataLoader(\n", " batch_size = batch_size, num_workers = num_workers, shuffle = False, \n", ")\n", "\n", "print('len(dataloader_train.dataset): ', len(dataloader_train.dataset))\n", "print('len(dataloader_valid.dataset): ', len(dataloader_valid.dataset))\n", "print('len(dataloader_infer.dataset): ', len(dataloader_infer.dataset))" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "name_dataset_pred = 'native'\n", "mpra_dataset_pred = MPRA_Dataset.load(name_paper, name_dataset_pred)\n", "\n", "batch_size = 64\n", "num_workers = 4\n", "dataloader_pred = mpra_dataset_pred.to_DataLoader(\n", " batch_size = batch_size, num_workers = num_workers, shuffle = False, \n", ")" ] } ], "metadata": { "kernelspec": { "display_name": "base", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.14" } }, "nbformat": 4, "nbformat_minor": 2 }