Multi-threading for downloading NCBI files in Python


So I have taken on the task of downloading a large collection of files from the NCBI database, and I have run into times where I have to create multiple databases. The code here works and downloads all the viruses from the NCBI website. My question is: is there any way to speed up the process of downloading these files?

Currently the runtime of the program is more than 5 hours. I have looked into multi-threading but could never get it to work, because some of these files take more than 10 seconds to download and I do not know how to handle stalling (new to programming). Is there also a way of handling urllib2.HTTPError: HTTP Error 502: Bad Gateway? I get it with certain combinations of retstart and retmax. It crashes the program and I have to restart the download from a different location by changing the 0 in the for statement.

import urllib2
from BeautifulSoup import BeautifulSoup

#This is the search query for NCBI. Spaces are replaced with +'s.
searchquery = 'viruses[orgn]+NOT+retroviridae[orgn]'
#This is the database we are searching.
database = 'protein'
#This is the output file for the data
output = 'sample.fasta'

#This is the base url for NCBI eutils.
base = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
#Create the search string from the information above
esearch = 'esearch.fcgi?db='+database+'&term='+searchquery+'&usehistory=y'
#Create the esearch url
url = base + esearch
#Fetch the esearch using urllib2
print url
content = urllib2.urlopen(url)
#Open the url in BeautifulSoup
doc = BeautifulSoup(content)
#Grab the amount of hits in the search
count = int(doc.find('count').string)
#Grab the WebEnv, the history of the search from usehistory.
webenv = doc.find('webenv').string
#Grab the QueryKey
querykey = doc.find('querykey').string
#Set the max amount of files to fetch at a time. Default is 500 files.
retmax = 10000
#Create the fetch string
efetch = 'efetch.fcgi?db='+database+'&WebEnv='+webenv+'&query_key='+querykey
#Select the output format and file format of the files.
#For a table visit: http://www.ncbi.nlm.nih.gov/books/NBK25499/table/chapter4.chapter4_table1
format = 'fasta'
type = 'text'
#Create the options string for efetch
options = '&rettype='+format+'&retmode='+type

#For statement from 0 to count, counting by retmax. Use xrange over range
for i in xrange(0, count, retmax):
    #Create the position string
    position = '&retstart='+str(i)+'&retmax='+str(retmax)
    #Create the efetch URL
    url = base + efetch + position + options
    print url
    #Grab the results
    response = urllib2.urlopen(url)
    #Write the output to file
    with open(output, 'a') as file:
        for line in response.readlines():
            file.write(line)
    #Gives a sense of how far along the download is
    print count - i - retmax
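One way to deal with the intermittent 502 errors mentioned above is to wrap urlopen in a small retry loop instead of letting the exception kill the run. A minimal sketch, assuming a retry on 502 only; the function name, attempt count, and delay are arbitrary choices and not part of the original code:

import time
import urllib2

def fetch_with_retries(url, attempts=3, delay=5):
    #Try the URL a few times before giving up, sleeping between attempts.
    for attempt in range(attempts):
        try:
            return urllib2.urlopen(url)
        except urllib2.HTTPError as e:
            #502 Bad Gateway from the eutils servers is usually transient, so retry it.
            if e.code == 502 and attempt < attempts - 1:
                time.sleep(delay)
            else:
                raise

Inside the for loop above, response = urllib2.urlopen(url) would then become response = fetch_with_retries(url), so a single bad gateway no longer forces a manual restart from a different retstart.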

To download the files using multiple threads:

#!/usr/bin/env python
import shutil
from contextlib import closing
from multiprocessing.dummy import Pool # use threads
from urllib2 import urlopen

def generate_urls(some, params): #XXX pass whatever parameters you need
    for restart in range(*params):
        # ... generate url, filename
        yield url, filename

def download((url, filename)):
    try:
        with closing(urlopen(url)) as response, open(filename, 'wb') as file:
            shutil.copyfileobj(response, file)
    except Exception as e:
        return (url, filename), repr(e)
    else: # success
        return (url, filename), None

def main():
    pool = Pool(20) # at most 20 concurrent downloads
    urls = generate_urls(some, params)
    for (url, filename), error in pool.imap_unordered(download, urls):
        if error is not None:
            print("Can't download {url} to {filename}, "
                  "reason: {error}".format(**locals()))

if __name__ == "__main__":
    main()
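For the NCBI case, generate_urls could be filled in roughly like this, reusing base, efetch, and options from the question's code; writing each retstart range to its own chunk file is an assumption made here so the parallel downloads do not fight over one output file:

def generate_urls(count, retmax):
    #Yield one efetch URL and one output filename per retstart chunk.
    for retstart in range(0, count, retmax):
        position = '&retstart=' + str(retstart) + '&retmax=' + str(retmax)
        url = base + efetch + position + options
        filename = 'chunk_%d.fasta' % retstart
        yield url, filename

main() would then call pool.imap_unordered(download, generate_urls(count, retmax)), and the per-chunk files can be concatenated into a single sample.fasta afterwards if one combined file is needed.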
