[1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to local packages are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2
[2]:

# Standard library
import os
import sys

# Third-party
import numpy as np
import pandas as pd
import torch

# Project-local
from mpra_test.mpra_test import MPRA_Dataset

Plant 2024 Jores

[3]:
# A dataset is selected by the paper name plus the dataset name within that paper.
name_paper = 'Plant_2024_Jores'
name_dataset = 'native'

# Load it; the bare last expression shows the dataset summary as the cell output.
mpra_dataset = MPRA_Dataset.load(name_paper, name_dataset)
mpra_dataset
[3]:
MPRA_Dataset object with n_seq × n_readout = 365518 × 5
    obs X: 'id', 'lib', 'species', 'chr', 'loci (start)', 'loci (end)', 'orientation'
    obs Y: ''
    readout: 'cold', 'warm', 'dark', 'light', 'maize'
Additional information:
    MPRA Technique: Lentiviral MPRA
    Readout Assay: STARR-seq
    Regulatory Element: Enhancer
    Sequence Origin: Native
    Species: Arabidopsis thaliana (At), Solanum lycopersicum (Sl), Sorghum bicolor (Sb), Zea mays (Zm)
[4]:
# Bare expression: the notebook displays the dataset summary as the cell output.
mpra_dataset
[4]:
MPRA_Dataset object with n_seq × n_readout = 365518 × 5
    obs X: 'id', 'lib', 'species', 'chr', 'loci (start)', 'loci (end)', 'orientation'
    obs Y: ''
    readout: 'cold', 'warm', 'dark', 'light', 'maize'
Additional information:
    MPRA Technique: Lentiviral MPRA
    Readout Assay: STARR-seq
    Regulatory Element: Enhancer
    Sequence Origin: Native
    Species: Arabidopsis thaliana (At), Solanum lycopersicum (Sl), Sorghum bicolor (Sb), Zea mays (Zm)
[5]:
# print() emits the same summary text on stdout instead of as a cell result.
print(mpra_dataset)
MPRA_Dataset object with n_seq × n_readout = 365518 × 5
    obs X: 'id', 'lib', 'species', 'chr', 'loci (start)', 'loci (end)', 'orientation'
    obs Y: ''
    readout: 'cold', 'warm', 'dark', 'light', 'maize'
Additional information:
    MPRA Technique: Lentiviral MPRA
    Readout Assay: STARR-seq
    Regulatory Element: Enhancer
    Sequence Origin: Native
    Species: Arabidopsis thaliana (At), Solanum lycopersicum (Sl), Sorghum bicolor (Sb), Zea mays (Zm)

[6]:
# Per-sequence metadata table: its columns are the ones listed under 'obs X'
# in the dataset summary, one row per sequence.
mpra_dataset.obs_X
[6]:
id lib species chr loci (start) loci (end) orientation
0 At-1 ACR At 1 1410 1580 Forward
1 At-1 ACR At 1 1580 1410 Reverse
2 At-10 ACR At 1 9640 9810 Forward
3 At-100 ACR At 1 108510 108680 Forward
4 At-100 ACR At 1 108680 108510 Reverse
... ... ... ... ... ... ... ...
365513 Zm-sh1531 ACR Zm 7 115534966 115534796 Reverse
365514 Zm-sh286 ACR Zm 1 291054873 291055043 Forward
365515 Zm-sh407 ACR Zm 2 80692168 80692338 Forward
365516 Zm-sh654 ACR Zm 3 85448476 85448306 Reverse
365517 Zm-sh779 ACR Zm 3 233053957 233053787 Reverse

365518 rows × 7 columns

[7]:
# Readout table: one column per readout, one row per sequence.
# The recorded output contains NaN where a sequence has no value for a readout.
mpra_dataset.Y
[7]:
cold warm dark light maize
0 -0.255701 0.130293 0.187041 -0.088923 -0.507449
1 -0.189147 0.384606 0.020128 0.051520 -1.133404
2 0.039015 0.922137 0.600000 0.475028 -0.420744
3 0.450037 1.553443 2.049545 1.206474 0.345508
4 0.398944 0.386737 2.296798 1.175671 0.715303
... ... ... ... ... ...
365513 NaN NaN NaN NaN -0.576264
365514 NaN NaN NaN NaN -0.243445
365515 NaN NaN NaN NaN -0.358087
365516 NaN NaN NaN NaN -1.606015
365517 NaN NaN NaN NaN -1.383622

365518 rows × 5 columns

[8]:
# Index with a boolean mask built from a readout column: keep only the
# sequences whose 'dark' readout is greater than 3. The result is again an
# MPRA_Dataset (see the printed summary).
print(mpra_dataset[mpra_dataset.Y['dark'] > 3])
MPRA_Dataset object with n_seq × n_readout = 13048 × 5
    obs X: 'id', 'lib', 'species', 'chr', 'loci (start)', 'loci (end)', 'orientation'
    obs Y: ''
    readout: 'cold', 'warm', 'dark', 'light', 'maize'
Additional information:
    MPRA Technique: Lentiviral MPRA
    Readout Assay: STARR-seq
    Regulatory Element: Enhancer
    Sequence Origin: Native
    Species: Arabidopsis thaliana (At), Solanum lycopersicum (Sl), Sorghum bicolor (Sb), Zea mays (Zm)

[9]:
# Index with a NumPy integer array (0, 1, 2); the recorded output is a
# 3-sequence MPRA_Dataset.
mpra_dataset[np.arange(3)]
[9]:
MPRA_Dataset object with n_seq × n_readout = 3 × 5
    obs X: 'id', 'lib', 'species', 'chr', 'loci (start)', 'loci (end)', 'orientation'
    obs Y: ''
    readout: 'cold', 'warm', 'dark', 'light', 'maize'
Additional information:
    MPRA Technique: Lentiviral MPRA
    Readout Assay: STARR-seq
    Regulatory Element: Enhancer
    Sequence Origin: Native
    Species: Arabidopsis thaliana (At), Solanum lycopersicum (Sl), Sorghum bicolor (Sb), Zea mays (Zm)
[10]:
# Index with a torch integer tensor (0, 1, 2) and inspect the result's shape.
# NOTE(review): the recorded shape is (3, 13) -- 3 rows, but 13 columns rather
# than the 5 readouts reported in the dataset summary. TODO confirm what the
# second dimension counts and whether tensor indexing returns an MPRA_Dataset.
mpra_dataset[torch.arange(3)].shape
[10]:
(3, 13)
[11]:
# Index with a plain Python list of integers; the recorded output is a
# 3-sequence MPRA_Dataset.
mpra_dataset[[0,1,2]]
[11]:
MPRA_Dataset object with n_seq × n_readout = 3 × 5
    obs X: 'id', 'lib', 'species', 'chr', 'loci (start)', 'loci (end)', 'orientation'
    obs Y: ''
    readout: 'cold', 'warm', 'dark', 'light', 'maize'
Additional information:
    MPRA Technique: Lentiviral MPRA
    Readout Assay: STARR-seq
    Regulatory Element: Enhancer
    Sequence Origin: Native
    Species: Arabidopsis thaliana (At), Solanum lycopersicum (Sl), Sorghum bicolor (Sb), Zea mays (Zm)
[12]:
# Index with a boolean mask built from a metadata column: keep only the
# sequences whose 'chr' value is 2, 3, 5 or 7.
mpra_dataset[mpra_dataset.obs_X['chr'].isin([2, 3, 5, 7])]
[12]:
MPRA_Dataset object with n_seq × n_readout = 172470 × 5
    obs X: 'id', 'lib', 'species', 'chr', 'loci (start)', 'loci (end)', 'orientation'
    obs Y: ''
    readout: 'cold', 'warm', 'dark', 'light', 'maize'
Additional information:
    MPRA Technique: Lentiviral MPRA
    Readout Assay: STARR-seq
    Regulatory Element: Enhancer
    Sequence Origin: Native
    Species: Arabidopsis thaliana (At), Solanum lycopersicum (Sl), Sorghum bicolor (Sb), Zea mays (Zm)
[13]:
# Compare list indexing with slice indexing.
# NOTE(review): in the recorded output the list [0,1,2] yields 3 sequences but
# the slice [:3] yields 4, i.e. the slice end appears to be inclusive.
# Confirm whether this is intended before relying on slices for splitting.
mpra_dataset[[0,1,2]], mpra_dataset[:3]
[13]:
(MPRA_Dataset object with n_seq × n_readout = 3 × 5
     obs X: 'id', 'lib', 'species', 'chr', 'loci (start)', 'loci (end)', 'orientation'
     obs Y: ''
     readout: 'cold', 'warm', 'dark', 'light', 'maize'
 Additional information:
     MPRA Technique: Lentiviral MPRA
     Readout Assay: STARR-seq
     Regulatory Element: Enhancer
     Sequence Origin: Native
     Species: Arabidopsis thaliana (At), Solanum lycopersicum (Sl), Sorghum bicolor (Sb), Zea mays (Zm),
 MPRA_Dataset object with n_seq × n_readout = 4 × 5
     obs X: 'id', 'lib', 'species', 'chr', 'loci (start)', 'loci (end)', 'orientation'
     obs Y: ''
     readout: 'cold', 'warm', 'dark', 'light', 'maize'
 Additional information:
     MPRA Technique: Lentiviral MPRA
     Readout Assay: STARR-seq
     Regulatory Element: Enhancer
     Sequence Origin: Native
     Species: Arabidopsis thaliana (At), Solanum lycopersicum (Sl), Sorghum bicolor (Sb), Zea mays (Zm))
[14]:
# Hold out whole chromosomes: 4 and 9 for validation, 2 and 7 for inference.
chr_valid = [4, 9]
chr_infer = [2, 7]

# Boolean masks over the sequences, derived from the 'chr' metadata column.
chromosome = mpra_dataset.obs_X['chr']
idx_valid = chromosome.isin(chr_valid)
idx_infer = chromosome.isin(chr_infer)

# Training set: every sequence that is in neither held-out group.
idx_train = ~idx_valid & ~idx_infer

[15]:
batch_size = 64
num_workers = 4

# Readout columns used as targets, named as in the dataset's `readout` list.
# Defined once and shared by all three splits so that training, validation
# and inference always use the same targets.
cols_Y = ['cold', 'warm', 'dark', 'light']

# Only the training loader is shuffled.
dataloader_train = mpra_dataset[idx_train].to_DataLoader(
    cols_Y = cols_Y,
    batch_size = batch_size, num_workers = num_workers, shuffle = True,
)
dataloader_valid = mpra_dataset[idx_valid].to_DataLoader(
    cols_Y = cols_Y,
    batch_size = batch_size, num_workers = num_workers, shuffle = False,
)
dataloader_infer = mpra_dataset[idx_infer].to_DataLoader(
    cols_Y = cols_Y,
    batch_size = batch_size, num_workers = num_workers, shuffle = False,
)

# Report the number of samples behind each loader.
print('len(dataloader_train.dataset): ', len(dataloader_train.dataset))
print('len(dataloader_valid.dataset): ', len(dataloader_valid.dataset))
print('len(dataloader_infer.dataset): ', len(dataloader_infer.dataset))
len(dataloader_train.dataset):  216247
len(dataloader_valid.dataset):  57836
len(dataloader_infer.dataset):  63737

Nature 2022 Regev

[16]:
# Switch to the 'test_complex' dataset of the Nature_2022_Regev paper.
# This rebinds `name_paper`, `name_dataset` and `mpra_dataset` from the previous section.
name_paper = 'Nature_2022_Regev'
name_dataset = 'test_complex'

# Load it; the bare last expression shows the dataset summary as the cell output.
mpra_dataset = MPRA_Dataset.load(name_paper, name_dataset)
mpra_dataset
[16]:
MPRA_Dataset object with n_seq × n_readout = 3331 × 1
    obs X: ''
    obs Y: ''
    readout: 'expr'
Additional information:
    Description: Test data
    Grow Condition: YPD
    MPRA Technique: Classic MPRA
    Readout Assay: FACS
    Regulatory Element: Promoter
    Sequence Origin: Random
    Species: Yeast
[17]:
n = len(mpra_dataset)

# Seeded generator so the split is identical after Restart Kernel -> Run All.
split_rng = np.random.default_rng(0)

# Shuffle all positions, then cut at 25% and 50%:
# first quarter -> validation, second quarter -> inference, remaining half -> training.
idx_valid, idx_infer, idx_train = np.split(split_rng.permutation(n), [int(0.25*n), int(0.50*n)])
[18]:
batch_size = 64
num_workers = 4

# Options common to every split; only `shuffle` differs between them.
shared_opts = dict(batch_size = batch_size, num_workers = num_workers)

# Only the training loader is shuffled.
dataloader_train = mpra_dataset[idx_train].to_DataLoader(shuffle = True, **shared_opts)
dataloader_valid = mpra_dataset[idx_valid].to_DataLoader(shuffle = False, **shared_opts)
dataloader_infer = mpra_dataset[idx_infer].to_DataLoader(shuffle = False, **shared_opts)

# Report the number of samples behind each loader.
for label, loader in (
    ('len(dataloader_train.dataset): ', dataloader_train),
    ('len(dataloader_valid.dataset): ', dataloader_valid),
    ('len(dataloader_infer.dataset): ', dataloader_infer),
):
    print(label, len(loader.dataset))
len(dataloader_train.dataset):  1666
len(dataloader_valid.dataset):  832
len(dataloader_infer.dataset):  833

Genome Research 2017 Seelig

[19]:
# Switch to the 'random' dataset of the GenomeResearch_2017_Seelig paper.
# This rebinds `name_paper`, `name_dataset` and `mpra_dataset` from the previous section.
name_paper = 'GenomeResearch_2017_Seelig'
name_dataset = 'random'

# Load it; the bare last expression shows the dataset summary as the cell output.
mpra_dataset = MPRA_Dataset.load(name_paper, name_dataset)
mpra_dataset
[19]:
MPRA_Dataset object with n_seq × n_readout = 489348 × 1
    obs X: ''
    obs Y: 'raw count (input)', 'raw count (output)'
    readout: 'expr'
Additional information:
    Description: Random 5' UTRs
    MPRA Technique: Classic MPRA
    Readout Assay: RNA-seq
    Regulatory Element: 5' UTR
    Sequence Origin: Random
    Species: Yeast
[20]:
# `info` exposes the 'Additional information' entries of the summary;
# the recorded output is a plain dict.
mpra_dataset.info
[20]:
{'Description': "Random 5' UTRs",
 'MPRA Technique': 'Classic MPRA',
 'Readout Assay': 'RNA-seq',
 'Regulatory Element': "5' UTR",
 'Sequence Origin': 'Random',
 'Species': 'Yeast'}
[21]:
n = len(mpra_dataset)

# Seeded generator so the split is identical after Restart Kernel -> Run All.
split_rng = np.random.default_rng(0)

# Shuffle all positions, then cut at 25% and 50%:
# first quarter -> validation, second quarter -> inference, remaining half -> training.
idx_valid, idx_infer, idx_train = np.split(split_rng.permutation(n), [int(0.25*n), int(0.50*n)])
[22]:
batch_size = 64
num_workers = 4

# Options common to every split; only `shuffle` differs between them.
shared_opts = dict(batch_size = batch_size, num_workers = num_workers)

# Only the training loader is shuffled.
dataloader_train = mpra_dataset[idx_train].to_DataLoader(shuffle = True, **shared_opts)
dataloader_valid = mpra_dataset[idx_valid].to_DataLoader(shuffle = False, **shared_opts)
dataloader_infer = mpra_dataset[idx_infer].to_DataLoader(shuffle = False, **shared_opts)

# Report the number of samples behind each loader.
for label, loader in (
    ('len(dataloader_train.dataset): ', dataloader_train),
    ('len(dataloader_valid.dataset): ', dataloader_valid),
    ('len(dataloader_infer.dataset): ', dataloader_infer),
):
    print(label, len(loader.dataset))
len(dataloader_train.dataset):  244674
len(dataloader_valid.dataset):  122337
len(dataloader_infer.dataset):  122337
[23]:
# Loader settings for the prediction set.
batch_size = 64
num_workers = 4

# Prediction data: the 'native' dataset of the paper currently held in
# `name_paper` (set in an earlier cell).
name_dataset_pred = 'native'
mpra_dataset_pred = MPRA_Dataset.load(name_paper, name_dataset_pred)

# The whole dataset goes into a single, unshuffled loader.
dataloader_pred = mpra_dataset_pred.to_DataLoader(
    shuffle = False,
    num_workers = num_workers,
    batch_size = batch_size,
)