Multi-threading for downloading NCBI files in Python
I have taken on the task of downloading a large collection of files from the NCBI database, and I have to run this several times to create multiple databases. The code below works and downloads the viruses from the NCBI website. My question is whether there is any way to speed up the process of downloading these files.
Currently the runtime of the program is more than 5 hours. I have looked into multi-threading but could never get it to work, because some of these files take more than 10 seconds to download and I do not know how to handle stalling (I'm new to programming). Is there also a way of handling urllib2.HTTPError: HTTP Error 502: Bad Gateway? I get it with certain combinations of retstart and retmax. It crashes the program and I have to restart the download from a different location by changing the 0 in the for statement.
import urllib2
from BeautifulSoup import BeautifulSoup

# This is the search query for NCBI. Spaces are replaced with +'s.
searchQuery = 'viruses[orgn]+NOT+retroviridae[orgn]'
# This is the database being searched.
database = 'protein'
# This is the output file for the data.
output = 'sample.fasta'
# This is the base URL for the NCBI eUtils.
base = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/'

# Create the search string from the information above.
esearch = 'esearch.fcgi?db=' + database + '&term=' + searchQuery + '&usehistory=y'
# Create the esearch URL.
url = base + esearch
# Fetch the esearch using urllib2.
print url
content = urllib2.urlopen(url)
# Open the URL in BeautifulSoup.
doc = BeautifulSoup(content)
# Grab the number of hits in the search.
count = int(doc.find('count').string)
# Grab the WebEnv, i.e. the history of the search (usehistory).
webEnv = doc.find('webenv').string
# Grab the QueryKey.
queryKey = doc.find('querykey').string
# Set the max number of records to fetch at a time. The default is 500.
retmax = 10000

# Create the fetch string.
efetch = 'efetch.fcgi?db=' + database + '&WebEnv=' + webEnv + '&query_key=' + queryKey

# Select the output format and file format of the files.
# For the table visit: http://www.ncbi.nlm.nih.gov/books/NBK25499/table/chapter4.chapter4_table1
format = 'fasta'
type = 'text'

# Create the options string for efetch.
options = '&rettype=' + format + '&retmode=' + type

# Loop from 0 to count, counting by retmax. Use xrange over range.
for i in xrange(0, count, retmax):
    # Create the position string.
    position = '&retstart=' + str(i) + '&retmax=' + str(retmax)
    # Create the efetch URL.
    url = base + efetch + position + options
    print url
    # Grab the results.
    response = urllib2.urlopen(url)
    # Write the output to the file.
    with open(output, 'a') as file:
        for line in response.readlines():
            file.write(line)
    # Gives a sense of progress.
    print count - i - retmax
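One way to keep a transient HTTP Error 502 from killing the whole run is to retry each efetch request a few times before giving up. This is only a minimal sketch, not part of the original post; the retry count, delay, and the helper name fetch_with_retry are arbitrary assumptions:

import time
import urllib2

def fetch_with_retry(url, retries=3, delay=30):
    # Try the URL several times so that a transient HTTP Error 502
    # (Bad Gateway) does not abort the entire download loop.
    for attempt in range(retries):
        try:
            return urllib2.urlopen(url).read()
        except urllib2.HTTPError as e:
            print 'attempt %d failed for %s: %s' % (attempt + 1, url, e)
            time.sleep(delay)
    raise IOError('giving up on %s after %d attempts' % (url, retries))

Inside the loop, response = urllib2.urlopen(url) could then be replaced by data = fetch_with_retry(url), with data written to the output file, so a single bad gateway no longer forces a restart from a different retstart.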
To download the files using multiple threads:
#!/usr/bin/env python
import shutil
from contextlib import closing
from multiprocessing.dummy import Pool  # use threads
from urllib2 import urlopen

def generate_urls(some, params):  # XXX pass whatever parameters you need
    for restart in range(*params):
        # ... generate url and filename
        yield url, filename

def download((url, filename)):
    try:
        with closing(urlopen(url)) as response, open(filename, 'wb') as file:
            shutil.copyfileobj(response, file)
    except Exception as e:
        return (url, filename), repr(e)
    else:  # success
        return (url, filename), None

def main():
    pool = Pool(20)  # at most 20 concurrent downloads
    urls = generate_urls(some, params)
    for (url, filename), error in pool.imap_unordered(download, urls):
        if error is not None:
            print("Can't download {url} to {filename}, "
                  "reason: {error}".format(**locals()))

if __name__ == "__main__":
    main()
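To connect this back to the question's code, generate_urls could yield one efetch URL per retstart chunk and give each chunk its own output file, so a failed chunk can be retried without re-downloading everything. This is only a sketch, assuming base, efetch, and options are built exactly as in the question; the chunk file naming is made up:

def generate_urls(count, retmax):
    # One efetch URL per retstart offset, each written to its own file.
    for retstart in xrange(0, count, retmax):
        url = (base + efetch + '&retstart=' + str(retstart) +
               '&retmax=' + str(retmax) + options)
        filename = 'chunk_%08d.fasta' % retstart
        yield url, filename

main() would then call pool.imap_unordered(download, generate_urls(count, retmax)), and the chunk files could be concatenated into sample.fasta once every download has succeeded.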