6
6
import logging
7
7
import os
8
8
import re
9
- import requests
10
9
import tarfile
11
10
import tempfile
12
-
13
11
from datetime import timedelta
14
12
from functools import lru_cache
15
13
from os .path import join
16
- from subprocess import Popen
17
14
from urllib .parse import urlparse , parse_qs
18
15
from xml .etree import ElementTree
19
16
20
17
import luigi
21
18
import requests
22
-
23
19
from bioluigi .tasks .utils import DynamicTaskWithOutputMixin , DynamicWrapperTask , TaskWithMetadataMixin
24
20
from luigi .util import requires
25
21
22
+ from .sra import DownloadSraExperiment
26
23
from ..config import rnaseq_pipeline
27
24
from ..miniml_utils import collect_geo_samples , collect_geo_samples_info
28
- from ..platforms import Platform , BgiPlatform , IlluminaPlatform
25
+ from ..platforms import BgiPlatform , IlluminaPlatform
29
26
from ..targets import ExpirableLocalTarget
30
27
from ..utils import RerunnableTaskMixin
31
- from .sra import DownloadSraExperiment
32
28
33
29
cfg = rnaseq_pipeline ()
34
30
@@ -53,7 +49,8 @@ def match_geo_platform(geo_platform):
53
49
return BgiPlatform (geo_platform_title .split (' ' )[0 ])
54
50
55
51
# Illumina HiSeq X and NextSeq 550 platforms are not prefixed with Illumina
56
- illumina_regex = [r'Illumina (.+) \(.+\)' , r'(HiSeq X .+) \(.+\)' , r'(NextSeq 550) \(.+\)' , r'(NextSeq 2000) \(.+\)' ]
52
+ illumina_regex = [r'Illumina (.+) \(.+\)' , r'(HiSeq X .+) \(.+\)' , r'(NextSeq 550) \(.+\)' ,
53
+ r'(NextSeq 2000) \(.+\)' ]
57
54
58
55
for r in illumina_regex :
59
56
illumina_match = re .match (r , geo_platform_title )
@@ -85,7 +82,8 @@ def run(self):
85
82
f .write (res .text )
86
83
87
84
def output (self ):
88
- return ExpirableLocalTarget (join (cfg .OUTPUT_DIR , cfg .METADATA , 'geo' , '{}.xml' .format (self .gsm )), ttl = timedelta (days = 14 ))
85
+ return ExpirableLocalTarget (join (cfg .OUTPUT_DIR , cfg .METADATA , 'geo' , '{}.xml' .format (self .gsm )),
86
+ ttl = timedelta (days = 14 ))
89
87
90
88
@requires (DownloadGeoSampleMetadata )
91
89
class DownloadGeoSample (DynamicTaskWithOutputMixin , DynamicWrapperTask ):
@@ -131,22 +129,25 @@ class DownloadGeoSeriesMetadata(TaskWithMetadataMixin, RerunnableTaskMixin, luig
131
129
def run (self ):
132
130
if self .output ().is_stale ():
133
131
logger .info ('%s is stale, redownloading...' , self .output ())
134
- res = requests .get ('https://ftp.ncbi.nlm.nih.gov/geo/series/' + self .gse [:- 3 ] + 'nnn/' + self .gse + '/miniml/' + self .gse + '_family.xml.tgz' , stream = True )
132
+ res = requests .get ('https://ftp.ncbi.nlm.nih.gov/geo/series/' + self .gse [
133
+ :- 3 ] + 'nnn/' + self .gse + '/miniml/' + self .gse + '_family.xml.tgz' ,
134
+ stream = True )
135
135
res .raise_for_status ()
136
136
# we need to use a temporary file because Response.raw does not allow seeking
137
137
with tempfile .TemporaryFile () as tmp :
138
138
for chunk in res .iter_content (chunk_size = 1024 ):
139
139
tmp .write (chunk )
140
140
tmp .seek (0 )
141
- with tarfile .open (fileobj = tmp , mode = 'r:gz' ) as fin , self .output ().temporary_path () as fpath , open (fpath , 'wb' ) as f :
141
+ with tarfile .open (fileobj = tmp , mode = 'r:gz' ) as fin , self .output ().temporary_path () as fpath , open (fpath ,
142
+ 'wb' ) as f :
142
143
reader = fin .extractfile (self .gse + '_family.xml' )
143
144
while chunk := reader .read (1024 ):
144
145
f .write (chunk )
145
146
146
-
147
147
def output (self ):
148
148
# TODO: remove the _family suffix
149
- return ExpirableLocalTarget (join (cfg .OUTPUT_DIR , cfg .METADATA , 'geo' , '{}_family.xml' .format (self .gse )), ttl = timedelta (days = 14 ))
149
+ return ExpirableLocalTarget (join (cfg .OUTPUT_DIR , cfg .METADATA , 'geo' , '{}_family.xml' .format (self .gse )),
150
+ ttl = timedelta (days = 14 ))
150
151
151
152
@requires (DownloadGeoSeriesMetadata )
152
153
class DownloadGeoSeries (DynamicTaskWithOutputMixin , DynamicWrapperTask ):
@@ -177,7 +178,9 @@ def run(self):
177
178
with self .output ().open ('w' ) as info_out :
178
179
for sample in samples :
179
180
if len (sample .output ()) == 0 :
180
- logger .warning ('GEO sample %s has no associated FASTQs from which batch information can be extracted.' , sample .sample_id )
181
+ logger .warning (
182
+ 'GEO sample %s has no associated FASTQs from which batch information can be extracted.' ,
183
+ sample .sample_id )
181
184
continue
182
185
183
186
# TODO: find a cleaner way to obtain the SRA run accession
0 commit comments