-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhisat_map.py
59 lines (54 loc) · 2.25 KB
/
hisat_map.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
#hisat_map.py
#Russell O. Stewart
#7/18/2017
#Executes HISAT mapping software for trimmed fastq files (outputted by skewer)
#Before running this, make sure that you have built a HISAT index:
#/path/to/hisat/directory/hisat-build -f /data/reference/iGenomes/Homo_sapiens/UCSC/hg38/Sequence/WholeGenomeFasta/genome.fa Homo_sapiens_index
#HISAT reference manual:
#http://ccb.jhu.edu/software/hisat/manual.shtml
#Download HISAT on Github:
#https://github.com/infphilo/hisat.git
import os
import time
import subprocess
#make sure to update these!!!
srcDir = '/Seibold/proj/Russell_playground/skewer'
destDir = '/Seibold/proj/Russell_playground/hiSatAligner'
runThreadN = 12
hisatIndexNamePrefix = 'Homo_sapiens_index'
email = 'russells98@gmail.com'
gtfFile='/data/reference/iGenomes/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.gtf'
#makes and returns a new directory with given directory path and name of dir
def mkDir(path , name):
newDir = os.path.join(path , name)
if not os.path.isdir(newDir):
os.makedirs(newDir)
return(newDir)
folders = os.listdir(srcDir)
hisatResultsDir = mkDir(destDir , 'hisatResults')
#find the trimmed fastq files and turn them into a vector of samples.
#this uses the file heirarchy outputted by skewer.
samples = []
sampleDestDir = []
i = 0
for folder in folders:
files = os.listdir(srcDir + '/' + folder)
samples.append([])
for f in files:
if not f.find('untrimmed') > 0 and f.find('.fastq.gz') > 0:
samples[i].append(folder + '/' + f)
tempName = f[:f.find('-trimmed')]
sampleDestDir.append(mkDir(hisatResultsDir , tempName))
samfile = open(sampleDestDir[i] + '/' + tempName + '_SAM_out.sam' , 'w')
samfile.write('')
samfile.close()
i += 1
#run HISAT for each sample
j = 0
for sample in samples:
command = "bsub -u %s -R \"rusage[mem=30000]\" '/Seibold/proj/Russell_playground/hiSatAligner/hisat/hisat -q -t -p %d --known-splicesite-infile %s -x %s -1 %s -2 %s -S %s'" % (email , runThreadN , gtfFile , hisatIndexNamePrefix , srcDir + '/' + sample[0] , srcDir + '/' + sample[1] , sampleDestDir[j] + '/' + sample[1][:sample[1].find('/')] + '_SAM_out.sam')
print command + '\n'
coreID = subprocess.check_output(command , shell=True)
print coreID
j += 1
time.sleep(3)