{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "%load_ext autoreload\n", "%autoreload 2" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "\n", "import os\n", "import sys\n", "import torch\n", "import numpy as np\n", "import pandas as pd\n", "\n", "from mpra_test.mpra_test import MPRA_Dataset" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Plant" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "MPRA_Dataset object with n_seq × n_readout = 365518 × 5\n", " obs X: 'id', 'lib', 'species', 'chr', 'loci (start)', 'loci (end)', 'orientation'\n", " obs Y: ''\n", " readout: 'cold', 'warm', 'dark', 'light', 'maize'\n", "Additional information:\n", " MPRA Technique: Lentiviral MPRA\n", " Readout Assay: STARR-seq\n", " Regulatory Element: Enhancer\n", " Sequence Origin: Native\n", " Species: Arabidopsis thaliana (At), Solanum lycopersicum (Sl), Sorghum bicolor (Sb), Zea mays (Zm)" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "name_paper = 'Plant_2024_Jores'\n", "name_dataset = 'native'\n", "\n", "mpra_dataset = MPRA_Dataset.load(name_paper, name_dataset)\n", "mpra_dataset" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "MPRA_Dataset object with n_seq × n_readout = 365518 × 5\n", " obs X: 'id', 'lib', 'species', 'chr', 'loci (start)', 'loci (end)', 'orientation'\n", " obs Y: ''\n", " readout: 'cold', 'warm', 'dark', 'light', 'maize'\n", "Additional information:\n", " MPRA Technique: Lentiviral MPRA\n", " Readout Assay: STARR-seq\n", " Regulatory Element: Enhancer\n", " Sequence Origin: Native\n", " Species: Arabidopsis thaliana (At), Solanum lycopersicum (Sl), Sorghum bicolor (Sb), Zea mays (Zm)" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mpra_dataset" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "MPRA_Dataset object with n_seq × n_readout = 365518 × 5\n", " obs X: 'id', 'lib', 'species', 'chr', 'loci (start)', 'loci (end)', 'orientation'\n", " obs Y: ''\n", " readout: 'cold', 'warm', 'dark', 'light', 'maize'\n", "Additional information:\n", " MPRA Technique: Lentiviral MPRA\n", " Readout Assay: STARR-seq\n", " Regulatory Element: Enhancer\n", " Sequence Origin: Native\n", " Species: Arabidopsis thaliana (At), Solanum lycopersicum (Sl), Sorghum bicolor (Sb), Zea mays (Zm)\n", "\n" ] } ], "source": [ "print(mpra_dataset)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
| \n", " | id | \n", "lib | \n", "species | \n", "chr | \n", "loci (start) | \n", "loci (end) | \n", "orientation | \n", "
|---|---|---|---|---|---|---|---|
| 0 | \n", "At-1 | \n", "ACR | \n", "At | \n", "1 | \n", "1410 | \n", "1580 | \n", "Forward | \n", "
| 1 | \n", "At-1 | \n", "ACR | \n", "At | \n", "1 | \n", "1580 | \n", "1410 | \n", "Reverse | \n", "
| 2 | \n", "At-10 | \n", "ACR | \n", "At | \n", "1 | \n", "9640 | \n", "9810 | \n", "Forward | \n", "
| 3 | \n", "At-100 | \n", "ACR | \n", "At | \n", "1 | \n", "108510 | \n", "108680 | \n", "Forward | \n", "
| 4 | \n", "At-100 | \n", "ACR | \n", "At | \n", "1 | \n", "108680 | \n", "108510 | \n", "Reverse | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 365513 | \n", "Zm-sh1531 | \n", "ACR | \n", "Zm | \n", "7 | \n", "115534966 | \n", "115534796 | \n", "Reverse | \n", "
| 365514 | \n", "Zm-sh286 | \n", "ACR | \n", "Zm | \n", "1 | \n", "291054873 | \n", "291055043 | \n", "Forward | \n", "
| 365515 | \n", "Zm-sh407 | \n", "ACR | \n", "Zm | \n", "2 | \n", "80692168 | \n", "80692338 | \n", "Forward | \n", "
| 365516 | \n", "Zm-sh654 | \n", "ACR | \n", "Zm | \n", "3 | \n", "85448476 | \n", "85448306 | \n", "Reverse | \n", "
| 365517 | \n", "Zm-sh779 | \n", "ACR | \n", "Zm | \n", "3 | \n", "233053957 | \n", "233053787 | \n", "Reverse | \n", "
365518 rows × 7 columns
\n", "| \n", " | cold | \n", "warm | \n", "dark | \n", "light | \n", "maize | \n", "
|---|---|---|---|---|---|
| 0 | \n", "-0.255701 | \n", "0.130293 | \n", "0.187041 | \n", "-0.088923 | \n", "-0.507449 | \n", "
| 1 | \n", "-0.189147 | \n", "0.384606 | \n", "0.020128 | \n", "0.051520 | \n", "-1.133404 | \n", "
| 2 | \n", "0.039015 | \n", "0.922137 | \n", "0.600000 | \n", "0.475028 | \n", "-0.420744 | \n", "
| 3 | \n", "0.450037 | \n", "1.553443 | \n", "2.049545 | \n", "1.206474 | \n", "0.345508 | \n", "
| 4 | \n", "0.398944 | \n", "0.386737 | \n", "2.296798 | \n", "1.175671 | \n", "0.715303 | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 365513 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "-0.576264 | \n", "
| 365514 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "-0.243445 | \n", "
| 365515 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "-0.358087 | \n", "
| 365516 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "-1.606015 | \n", "
| 365517 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "-1.383622 | \n", "
365518 rows × 5 columns
\n", "