# CUDA_VISIBLE_DEVICES=2 python -m torch.distributed.launch --nproc_per_node=1 --master_port 3303 Predictor.py --predict_file /home/ubuntu/Experimental_Data/v1_5UTR_seqs_with_v1Label.fasta --outdir /home/ubuntu/Experimental_Data/try --outfilename try_RVACv1
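# Standalone inference script: builds the CNN_linear model defined below (an ESM-2
# encoder followed by a small fully-connected head), loads weights from `model.pkl`,
# and predicts for each input sequence the probability that it is a 5'UTR.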
import os
from Bio import SeqIO
import sys
# import argparse
# from argparse import Namespace
# import pathlib
import torch
import torch.nn as nn
import torch.nn.functional as F
# import esm
# from esm.data import *
# from esm.model.esm2_secondarystructure import ESM2 as ESM2_SISS
from esm.model.esm2 import ESM2 as ESM2_SISS
# from esm.model.esm2_supervised import ESM2
from esm import Alphabet, FastaBatchedDataset  # , ProteinBertModel, pretrained, MSATransformer
import numpy as np
import pandas as pd
import random
# import math
# import scipy.stats as stats
# from scipy.stats import spearmanr, pearsonr
# from sklearn import preprocessing
# from copy import deepcopy
from tqdm import tqdm  # , trange
# import matplotlib.pyplot as plt
# import seaborn as sns
# from sklearn.model_selection import KFold
# from torch.optim.lr_scheduler import StepLR
# import torch.distributed as dist
# from torch.nn.parallel import DistributedDataParallel
# from torch.utils.data.distributed import DistributedSampler
from io import StringIO
seed = 1337
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
# torch.cuda.manual_seed(seed)
# torch.cuda.manual_seed_all(seed)

# parser = argparse.ArgumentParser()
# parser.add_argument('--device_ids', type=str, default='0', help="Training Devices")
# parser.add_argument('--local-rank', type=int, default=-1, help="DDP parameter, do not modify")
# parser.add_argument('--outdir', type=str, default='/home/ubuntu/Experimental_Data/try')
# parser.add_argument('--outfilename', type=str, default='try_RVACv1')
# parser.add_argument('--predict_file', type=str, default='/home/ubuntu/Experimental_Data/v1_5UTR_seqs_with_v1Label.fasta')
# args = parser.parse_args()
# print(args)
global modelfile, layers, heads, embed_dim, batch_toks, inp_len, device_ids, device

modelfile = 'model.pkl'
# model_info = modelfile.split('/')[-1].split('_')
# for item in model_info:
#     if 'layers' in item:
#         layers = int(item[0])
#     elif 'heads' in item:
#         heads = int(item[:-5])
#     elif 'embedsize' in item:
#         embed_dim = int(item[:-9])
#     elif 'batchToks' in item:
#         batch_toks = 4096
layers = 6
heads = 16
embed_dim = 128
batch_toks = 4096
inp_len = 50

# device_ids = list(map(int, args.device_ids.split(',')))
# dist.init_process_group(backend='nccl')
# device = torch.device('cuda:{}'.format(device_ids[args.local_rank]))
device = "cpu"
# torch.cuda.set_device(device)
# local_rank = args.local_rank
local_rank = -1
# storage_id = int(device_ids[local_rank])
storage_id = 0
# repr_layers = [layers]
include = ["mean"]
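# The hyperparameters above are hard-coded (6 layers, 16 attention heads, 128-dim
# embeddings, 50-nt input window, 4096 tokens per batch) rather than parsed from the
# checkpoint filename as in the commented-out block; they are assumed to match the
# architecture stored in `model.pkl`. Inference runs on CPU.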
class CNN_linear(nn.Module):
    def __init__(self,
                 border_mode='same', filter_len=8, nbr_filters=120,
                 dropout1=0, dropout2=0):
        super(CNN_linear, self).__init__()

        self.embedding_size = embed_dim
        self.border_mode = border_mode
        self.inp_len = inp_len
        self.nodes = 40
        self.cnn_layers = 0
        self.filter_len = filter_len
        self.nbr_filters = nbr_filters
        self.dropout1 = dropout1
        self.dropout2 = dropout2
        self.dropout3 = 0.5

        self.esm2 = ESM2_SISS(num_layers=layers,
                              embed_dim=embed_dim,
                              attention_heads=heads,
                              alphabet=alphabet)

        self.conv1 = nn.Conv1d(in_channels=self.embedding_size,
                               out_channels=self.nbr_filters, kernel_size=self.filter_len, padding=self.border_mode)
        self.conv2 = nn.Conv1d(in_channels=self.nbr_filters,
                               out_channels=self.nbr_filters, kernel_size=self.filter_len, padding=self.border_mode)
        self.dropout1 = nn.Dropout(self.dropout1)
        self.dropout2 = nn.Dropout(self.dropout2)
        self.dropout3 = nn.Dropout(self.dropout3)
        self.relu = nn.ReLU()
        self.flatten = nn.Flatten()
        self.fc = nn.Linear(in_features=embed_dim, out_features=self.nodes)
        self.linear = nn.Linear(in_features=self.nbr_filters, out_features=self.nodes)
        self.output = nn.Linear(in_features=self.nodes, out_features=1)
        self.direct_output = nn.Linear(in_features=embed_dim, out_features=1)
        self.magic_output = nn.Linear(in_features=1, out_features=1)

    def forward(self, tokens, need_head_weights=True, return_contacts=False, return_representation=True):
        # x = self.esm2(tokens, [layers], need_head_weights, return_contacts, return_representation)
        x = self.esm2(tokens, [layers])
        x = x["representations"][layers][:, 0]
        x_o = x.unsqueeze(2)

        x = self.flatten(x_o)
        o_linear = self.fc(x)
        o_relu = self.relu(o_linear)
        o_dropout = self.dropout3(o_relu)
        o = self.output(o_dropout)
        return o
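# forward() returns one unnormalised logit per sequence, computed from the position-0
# token embedding of the final encoder layer. Shapes (assumed): tokens (batch, seq_len)
# -> output (batch, 1). The sigmoid is applied downstream in eval_step().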
def eval_step(dataloader, model, threshold=0.5):
    model.eval()
    y_pred_list, y_prob_list = [], []
    ids_list, strs_list = [], []
    with torch.no_grad():
        # for (ids, strs, _, toks, _, _) in tqdm(dataloader):
        for ids, strs, toks in tqdm(dataloader):
            ids_list.extend(ids)
            strs_list.extend(strs)

            # toks = toks.to(device)
            # print(toks)

            logits = model(toks)
            logits = logits.reshape(-1)
            y_prob = torch.sigmoid(logits)
            y_pred = (y_prob > threshold).long()

            y_prob_list.extend(y_prob.cpu().detach().tolist())
            y_pred_list.extend(y_pred.cpu().detach().tolist())

    data_pred = pd.DataFrame([ids_list, strs_list, y_prob_list, y_pred_list],
                             index=['ID', 'Sequence', "Probability as 5'UTR", "Prediction as 5'UTR"]).T
    return data_pred
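# eval_step() returns a pandas DataFrame with one row per input sequence and the
# columns 'ID', 'Sequence', "Probability as 5'UTR" (sigmoid of the logit) and
# "Prediction as 5'UTR" (1 if the probability exceeds `threshold`, else 0).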
def generate_dataset_dataloader(ids, seqs):
    # dataset = FastaBatchedDataset(ids, seqs, mask_prob = 0.0)
    dataset = FastaBatchedDataset(ids, seqs)
    batches = dataset.get_batch_indices(toks_per_batch=batch_toks, extra_toks_per_seq=2)
    dataloader = torch.utils.data.DataLoader(dataset,
                                             collate_fn=alphabet.get_batch_converter(),
                                             batch_sampler=batches,
                                             shuffle=False)
    print(f"{len(dataset)} sequences")
    return dataset, dataloader
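# Batches are sized by total token count (batch_toks = 4096) rather than by a fixed
# number of sequences, via FastaBatchedDataset.get_batch_indices, so the number of
# sequences per batch can vary.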
def read_fasta(file):
    # Make sure the file is not empty
    if os.path.getsize(file) == 0:
        print("Error: The file is empty!")
        sys.exit()

    ids = []
    sequences = []

    for record in SeqIO.parse(file, "fasta"):
        # Check that the record header starts with '>'
        # if not record.id.startswith('>'):
        #     print(f"Error: The sequence '{record.id}' is not properly formatted, it does not start with '>'. Skipping...")
        #     continue

        # Check that the sequence contains only A, G, C, T
        sequence = str(record.seq).upper()[-inp_len:]
        if not set(sequence).issubset(set("AGCT")):
            print(f"Error: The sequence '{record.description}' contains invalid characters. Only A, G, C, T are allowed. Skipping...")
            continue

        # Keep only the sequences that pass the checks
        ids.append(record.id)
        sequences.append(sequence)

    return ids, sequences
def read_raw(raw_input):
    ids = []
    sequences = []

    file = StringIO(raw_input)
    for record in SeqIO.parse(file, "fasta"):
        # Check that the record header starts with '>'
        # if not record.id.startswith('>'):
        #     print(f"Error: The sequence '{record.id}' is not properly formatted, it does not start with '>'. Skipping...")
        #     continue

        # Check that the sequence contains only A, G, C, T
        sequence = str(record.seq).upper()[-inp_len:]
        if not set(sequence).issubset(set("AGCT")):
            print(f"Error: The sequence '{record.description}' contains invalid characters. Only A, G, C, T are allowed. Skipping...")
            continue

        # Keep only the sequences that pass the checks
        ids.append(record.id)
        sequences.append(sequence)

    return ids, sequences
#######
# alphabet = Alphabet(mask_prob = 0.0, standard_toks = 'AGCT')
alphabet = Alphabet(prepend_toks=("<pad>", "<eos>", "<unk>"), standard_toks='AGCT', append_toks=("<cls>", "<mask>", "<sep>"))
# print(alphabet.tok_to_idx)
# assert alphabet.tok_to_idx == {'<pad>': 0, '<eos>': 1, '<unk>': 2, 'A': 3, 'G': 4, 'C': 5, 'T': 6, '<cls>': 7, '<mask>': 8, '<sep>': 9}
alphabet.tok_to_idx = {'<pad>': 0, '<eos>': 1, '<unk>': 2, 'A': 3, 'G': 4, 'C': 5, 'T': 6, '<cls>': 7, '<mask>': 8, '<sep>': 9}
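# The token-to-index map is overwritten explicitly so the nucleotide vocabulary matches
# the indexing the checkpoint was presumably trained with (see the commented assert
# above); if the Alphabet constructor already produces this mapping, the assignment is
# a no-op.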
def predict_file(input_file):
    print('====Load Data====')
    ids, seqs = read_fasta(input_file)
    _, dataloader = generate_dataset_dataloader(ids, seqs)

    model = CNN_linear().to(device)
    # model.load_state_dict({k.replace('module.', ''): v for k, v in torch.load(modelfile, map_location=lambda storage, loc: storage.cuda(storage_id)).items()}, strict=False)
    model.load_state_dict({k.replace('module.', ''): v for k, v in torch.load(modelfile, map_location=torch.device('cpu')).items()}, strict=False)
    # model = DistributedDataParallel(model, device_ids=[device_ids[local_rank]], output_device=device_ids[local_rank], find_unused_parameters=True)

    print('====Predict====')
    pred = eval_step(dataloader, model)
    print(pred)

    # print('====Save Results====')
    # if not os.path.exists(args.outdir): os.makedirs(args.outdir)
    # pred.to_csv(f'{args.outdir}/{args.outfilename}_prediction_results.csv', index=False)
def predict_raw(raw_input):
    print('====Parse Input====')
    ids, seqs = read_raw(raw_input)
    _, dataloader = generate_dataset_dataloader(ids, seqs)

    model = CNN_linear().to(device)
    # model.load_state_dict({k.replace('module.', ''): v for k, v in torch.load(modelfile, map_location=lambda storage, loc: storage.cuda(storage_id)).items()}, strict=False)
    model.load_state_dict({k.replace('module.', ''): v for k, v in torch.load(modelfile, map_location=torch.device('cpu')).items()}, strict=False)
    # model = DistributedDataParallel(model, device_ids=[device_ids[local_rank]], output_device=device_ids[local_rank], find_unused_parameters=True)

    print('====Predict====')
    pred = eval_step(dataloader, model)
    print(pred)
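# Example usage (hypothetical paths; the argparse-driven entry point is commented out above):
# if __name__ == '__main__':
#     predict_file('/path/to/5utr_sequences.fasta')
#     # or, for raw FASTA text passed as a string:
#     # predict_raw('>seq1\nACGTACGTACGT')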