[1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell runs and edits to the project package are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
[2]:
import os
import sys
import torch
import numpy as np
import pandas as pd
from mpra_test.mpra_test import MPRA_Dataset
Plant 2024 Jores¶
[3]:
# Identify the paper and the dataset within it, load it, and let the
# notebook display the dataset summary as the cell result.
name_paper, name_dataset = 'Plant_2024_Jores', 'native'
mpra_dataset = MPRA_Dataset.load(name_paper, name_dataset)
mpra_dataset
[3]:
MPRA_Dataset object with n_seq × n_readout = 365518 × 5
obs X: 'id', 'lib', 'species', 'chr', 'loci (start)', 'loci (end)', 'orientation'
obs Y: ''
readout: 'cold', 'warm', 'dark', 'light', 'maize'
Additional information:
MPRA Technique: Lentiviral MPRA
Readout Assay: STARR-seq
Regulatory Element: Enhancer
Sequence Origin: Native
Species: Arabidopsis thaliana (At), Solanum lycopersicum (Sl), Sorghum bicolor (Sb), Zea mays (Zm)
[4]:
# Bare last expression: the notebook renders the dataset summary.
mpra_dataset
[4]:
MPRA_Dataset object with n_seq × n_readout = 365518 × 5
obs X: 'id', 'lib', 'species', 'chr', 'loci (start)', 'loci (end)', 'orientation'
obs Y: ''
readout: 'cold', 'warm', 'dark', 'light', 'maize'
Additional information:
MPRA Technique: Lentiviral MPRA
Readout Assay: STARR-seq
Regulatory Element: Enhancer
Sequence Origin: Native
Species: Arabidopsis thaliana (At), Solanum lycopersicum (Sl), Sorghum bicolor (Sb), Zea mays (Zm)
[5]:
# print() emits the same summary text as the bare-expression display
# in the recorded outputs.
print(mpra_dataset)
MPRA_Dataset object with n_seq × n_readout = 365518 × 5
obs X: 'id', 'lib', 'species', 'chr', 'loci (start)', 'loci (end)', 'orientation'
obs Y: ''
readout: 'cold', 'warm', 'dark', 'light', 'maize'
Additional information:
MPRA Technique: Lentiviral MPRA
Readout Assay: STARR-seq
Regulatory Element: Enhancer
Sequence Origin: Native
Species: Arabidopsis thaliana (At), Solanum lycopersicum (Sl), Sorghum bicolor (Sb), Zea mays (Zm)
[6]:
# Per-sequence annotation table; its columns are the ones listed under
# "obs X" in the dataset summary.
mpra_dataset.obs_X
[6]:
| id | lib | species | chr | loci (start) | loci (end) | orientation | |
|---|---|---|---|---|---|---|---|
| 0 | At-1 | ACR | At | 1 | 1410 | 1580 | Forward |
| 1 | At-1 | ACR | At | 1 | 1580 | 1410 | Reverse |
| 2 | At-10 | ACR | At | 1 | 9640 | 9810 | Forward |
| 3 | At-100 | ACR | At | 1 | 108510 | 108680 | Forward |
| 4 | At-100 | ACR | At | 1 | 108680 | 108510 | Reverse |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 365513 | Zm-sh1531 | ACR | Zm | 7 | 115534966 | 115534796 | Reverse |
| 365514 | Zm-sh286 | ACR | Zm | 1 | 291054873 | 291055043 | Forward |
| 365515 | Zm-sh407 | ACR | Zm | 2 | 80692168 | 80692338 | Forward |
| 365516 | Zm-sh654 | ACR | Zm | 3 | 85448476 | 85448306 | Reverse |
| 365517 | Zm-sh779 | ACR | Zm | 3 | 233053957 | 233053787 | Reverse |
365518 rows × 7 columns
[7]:
# Readout table: one column per readout named in the dataset summary.
# The recorded output shows NaN where a sequence has no value for a readout.
mpra_dataset.Y
[7]:
| cold | warm | dark | light | maize | |
|---|---|---|---|---|---|
| 0 | -0.255701 | 0.130293 | 0.187041 | -0.088923 | -0.507449 |
| 1 | -0.189147 | 0.384606 | 0.020128 | 0.051520 | -1.133404 |
| 2 | 0.039015 | 0.922137 | 0.600000 | 0.475028 | -0.420744 |
| 3 | 0.450037 | 1.553443 | 2.049545 | 1.206474 | 0.345508 |
| 4 | 0.398944 | 0.386737 | 2.296798 | 1.175671 | 0.715303 |
| ... | ... | ... | ... | ... | ... |
| 365513 | NaN | NaN | NaN | NaN | -0.576264 |
| 365514 | NaN | NaN | NaN | NaN | -0.243445 |
| 365515 | NaN | NaN | NaN | NaN | -0.358087 |
| 365516 | NaN | NaN | NaN | NaN | -1.606015 |
| 365517 | NaN | NaN | NaN | NaN | -1.383622 |
365518 rows × 5 columns
[8]:
# Index with a boolean mask built from a readout column; the recorded
# output shows the result is again an MPRA_Dataset with fewer sequences.
print(mpra_dataset[mpra_dataset.Y['dark'] > 3])
MPRA_Dataset object with n_seq × n_readout = 13048 × 5
obs X: 'id', 'lib', 'species', 'chr', 'loci (start)', 'loci (end)', 'orientation'
obs Y: ''
readout: 'cold', 'warm', 'dark', 'light', 'maize'
Additional information:
MPRA Technique: Lentiviral MPRA
Readout Assay: STARR-seq
Regulatory Element: Enhancer
Sequence Origin: Native
Species: Arabidopsis thaliana (At), Solanum lycopersicum (Sl), Sorghum bicolor (Sb), Zea mays (Zm)
[9]:
# Index with a NumPy integer array to select sequences.
mpra_dataset[np.arange(3)]
[9]:
MPRA_Dataset object with n_seq × n_readout = 3 × 5
obs X: 'id', 'lib', 'species', 'chr', 'loci (start)', 'loci (end)', 'orientation'
obs Y: ''
readout: 'cold', 'warm', 'dark', 'light', 'maize'
Additional information:
MPRA Technique: Lentiviral MPRA
Readout Assay: STARR-seq
Regulatory Element: Enhancer
Sequence Origin: Native
Species: Arabidopsis thaliana (At), Solanum lycopersicum (Sl), Sorghum bicolor (Sb), Zea mays (Zm)
[10]:
# Index with a torch integer tensor and inspect the result's shape.
# NOTE(review): the recorded shape is (3, 13), not 3 x 5 as for the NumPy
# and list indexers -- presumably a different return type for tensor
# indices; confirm against MPRA_Dataset.__getitem__.
mpra_dataset[torch.arange(3)].shape
[10]:
(3, 13)
[11]:
# Index with a plain Python list of integers to select sequences.
mpra_dataset[[0,1,2]]
[11]:
MPRA_Dataset object with n_seq × n_readout = 3 × 5
obs X: 'id', 'lib', 'species', 'chr', 'loci (start)', 'loci (end)', 'orientation'
obs Y: ''
readout: 'cold', 'warm', 'dark', 'light', 'maize'
Additional information:
MPRA Technique: Lentiviral MPRA
Readout Assay: STARR-seq
Regulatory Element: Enhancer
Sequence Origin: Native
Species: Arabidopsis thaliana (At), Solanum lycopersicum (Sl), Sorghum bicolor (Sb), Zea mays (Zm)
[12]:
# Index with a boolean mask from an annotation column: keep sequences whose
# 'chr' value is one of the listed chromosomes.
mpra_dataset[mpra_dataset.obs_X['chr'].isin([2, 3, 5, 7])]
[12]:
MPRA_Dataset object with n_seq × n_readout = 172470 × 5
obs X: 'id', 'lib', 'species', 'chr', 'loci (start)', 'loci (end)', 'orientation'
obs Y: ''
readout: 'cold', 'warm', 'dark', 'light', 'maize'
Additional information:
MPRA Technique: Lentiviral MPRA
Readout Assay: STARR-seq
Regulatory Element: Enhancer
Sequence Origin: Native
Species: Arabidopsis thaliana (At), Solanum lycopersicum (Sl), Sorghum bicolor (Sb), Zea mays (Zm)
[13]:
# Compare list indexing with slice indexing.
# NOTE(review): in the recorded output [[0,1,2]] yields 3 sequences but [:3]
# yields 4, so the slice end appears to be inclusive (label-based?) --
# confirm whether this is intended in MPRA_Dataset.__getitem__.
mpra_dataset[[0,1,2]], mpra_dataset[:3]
[13]:
(MPRA_Dataset object with n_seq × n_readout = 3 × 5
obs X: 'id', 'lib', 'species', 'chr', 'loci (start)', 'loci (end)', 'orientation'
obs Y: ''
readout: 'cold', 'warm', 'dark', 'light', 'maize'
Additional information:
MPRA Technique: Lentiviral MPRA
Readout Assay: STARR-seq
Regulatory Element: Enhancer
Sequence Origin: Native
Species: Arabidopsis thaliana (At), Solanum lycopersicum (Sl), Sorghum bicolor (Sb), Zea mays (Zm),
MPRA_Dataset object with n_seq × n_readout = 4 × 5
obs X: 'id', 'lib', 'species', 'chr', 'loci (start)', 'loci (end)', 'orientation'
obs Y: ''
readout: 'cold', 'warm', 'dark', 'light', 'maize'
Additional information:
MPRA Technique: Lentiviral MPRA
Readout Assay: STARR-seq
Regulatory Element: Enhancer
Sequence Origin: Native
Species: Arabidopsis thaliana (At), Solanum lycopersicum (Sl), Sorghum bicolor (Sb), Zea mays (Zm))
[14]:
# Hold out whole chromosomes: one set for validation, one for inference.
chr_valid, chr_infer = [4, 9], [2, 7]
chrom = mpra_dataset.obs_X['chr']
idx_valid = chrom.isin(chr_valid)
idx_infer = chrom.isin(chr_infer)
# Training rows are those in neither held-out set.
idx_train = ~idx_valid & ~idx_infer
[15]:
batch_size = 64
num_workers = 4
# Readouts used as targets. Defined once so the train, validation and
# inference loaders select exactly the same columns; the names are the
# readout names shown in the dataset summary. Previously the validation and
# inference loaders used a different spelling ('Y: cold', ...) than the
# training loader.
cols_Y = ['cold', 'warm', 'dark', 'light']
# Only the training loader shuffles; evaluation loaders keep a fixed order.
dataloader_train = mpra_dataset[idx_train].to_DataLoader(
    cols_Y = cols_Y,
    batch_size = batch_size, num_workers = num_workers, shuffle = True,
)
dataloader_valid = mpra_dataset[idx_valid].to_DataLoader(
    cols_Y = cols_Y,
    batch_size = batch_size, num_workers = num_workers, shuffle = False,
)
dataloader_infer = mpra_dataset[idx_infer].to_DataLoader(
    cols_Y = cols_Y,
    batch_size = batch_size, num_workers = num_workers, shuffle = False,
)
print('len(dataloader_train.dataset): ', len(dataloader_train.dataset))
print('len(dataloader_valid.dataset): ', len(dataloader_valid.dataset))
print('len(dataloader_infer.dataset): ', len(dataloader_infer.dataset))
len(dataloader_train.dataset): 216247
len(dataloader_valid.dataset): 57836
len(dataloader_infer.dataset): 63737
Nature 2022 Regev¶
[16]:
# Switch to the Regev test set: choose paper and dataset, load it, and let
# the notebook display the dataset summary as the cell result.
name_paper, name_dataset = 'Nature_2022_Regev', 'test_complex'
mpra_dataset = MPRA_Dataset.load(name_paper, name_dataset)
mpra_dataset
[16]:
MPRA_Dataset object with n_seq × n_readout = 3331 × 1
obs X: ''
obs Y: ''
readout: 'expr'
Additional information:
Description: Test data
Grow Condition: YPD
MPRA Technique: Classic MPRA
Readout Assay: FACS
Regulatory Element: Promoter
Sequence Origin: Random
Species: Yeast
[17]:
# Random 25% / 25% / 50% split into validation / inference / training indices.
# A seeded generator is used so the split is identical on every
# Restart & Run All (the unseeded np.random.permutation was not reproducible).
seed_split = 0
n = len(mpra_dataset)
rng_split = np.random.default_rng(seed_split)
idx_valid, idx_infer, idx_train = np.split(
    rng_split.permutation(n), [int(0.25 * n), int(0.50 * n)]
)
[18]:
batch_size = 64
num_workers = 4
# Settings shared by all three loaders; only `shuffle` differs per split
# (training shuffles, validation and inference keep a fixed order).
loader_kwargs = dict(batch_size = batch_size, num_workers = num_workers)
dataloader_train = mpra_dataset[idx_train].to_DataLoader(shuffle = True, **loader_kwargs)
dataloader_valid = mpra_dataset[idx_valid].to_DataLoader(shuffle = False, **loader_kwargs)
dataloader_infer = mpra_dataset[idx_infer].to_DataLoader(shuffle = False, **loader_kwargs)
# Report how many sequences ended up in each split.
print('len(dataloader_train.dataset): ', len(dataloader_train.dataset))
print('len(dataloader_valid.dataset): ', len(dataloader_valid.dataset))
print('len(dataloader_infer.dataset): ', len(dataloader_infer.dataset))
len(dataloader_train.dataset): 1666
len(dataloader_valid.dataset): 832
len(dataloader_infer.dataset): 833
Genome Research 2017 Seelig¶
[19]:
# Switch to the Seelig random 5' UTR library: choose paper and dataset, load
# it, and let the notebook display the dataset summary as the cell result.
name_paper, name_dataset = 'GenomeResearch_2017_Seelig', 'random'
mpra_dataset = MPRA_Dataset.load(name_paper, name_dataset)
mpra_dataset
[19]:
MPRA_Dataset object with n_seq × n_readout = 489348 × 1
obs X: ''
obs Y: 'raw count (input)', 'raw count (output)'
readout: 'expr'
Additional information:
Description: Random 5' UTRs
MPRA Technique: Classic MPRA
Readout Assay: RNA-seq
Regulatory Element: 5' UTR
Sequence Origin: Random
Species: Yeast
[20]:
# Dataset metadata; the recorded output is a dict with the same entries the
# summary prints under "Additional information".
mpra_dataset.info
[20]:
{'Description': "Random 5' UTRs",
'MPRA Technique': 'Classic MPRA',
'Readout Assay': 'RNA-seq',
'Regulatory Element': "5' UTR",
'Sequence Origin': 'Random',
'Species': 'Yeast'}
[21]:
# Random 25% / 25% / 50% split into validation / inference / training indices.
# A seeded generator is used so the split is identical on every
# Restart & Run All (the unseeded np.random.permutation was not reproducible).
seed_split = 0
n = len(mpra_dataset)
rng_split = np.random.default_rng(seed_split)
idx_valid, idx_infer, idx_train = np.split(
    rng_split.permutation(n), [int(0.25 * n), int(0.50 * n)]
)
[22]:
batch_size = 64
num_workers = 4
# Settings shared by all three loaders; only `shuffle` differs per split
# (training shuffles, validation and inference keep a fixed order).
loader_kwargs = dict(batch_size = batch_size, num_workers = num_workers)
dataloader_train = mpra_dataset[idx_train].to_DataLoader(shuffle = True, **loader_kwargs)
dataloader_valid = mpra_dataset[idx_valid].to_DataLoader(shuffle = False, **loader_kwargs)
dataloader_infer = mpra_dataset[idx_infer].to_DataLoader(shuffle = False, **loader_kwargs)
# Report how many sequences ended up in each split.
print('len(dataloader_train.dataset): ', len(dataloader_train.dataset))
print('len(dataloader_valid.dataset): ', len(dataloader_valid.dataset))
print('len(dataloader_infer.dataset): ', len(dataloader_infer.dataset))
len(dataloader_train.dataset): 244674
len(dataloader_valid.dataset): 122337
len(dataloader_infer.dataset): 122337
[23]:
# Load the 'native' dataset of the paper currently named by name_paper and
# wrap the whole of it in an unshuffled loader for prediction.
name_dataset_pred = 'native'
mpra_dataset_pred = MPRA_Dataset.load(name_paper, name_dataset_pred)
batch_size, num_workers = 64, 4
dataloader_pred = mpra_dataset_pred.to_DataLoader(
    shuffle = False, batch_size = batch_size, num_workers = num_workers,
)