一 pycurl介绍
pycurl模块为libcurl库提供了一个python接口。libcurl是一个开源免费且方便快捷的基于客户端的url传输库,支持FTP,HTTP,HTTPS,IMAP,IMAPS,LDAP,LDAPS,POP3,POP3S,RTMP,RTSP,SCP等等。libcurl还支持SSL认证,HTTP POST,HTTP PUT,FTP UPLOADING等等。和urllib模块类似,pycurl模块也可以用来获取一个url的对象。pycurl使用了大部分libcurl提供的函数,使得pycurl具有以下特性:
快速 libcurl本身就很快速,pycurl就是对libcurl进行了一次封装,所以pycurl同样很快速。
支持多种协议,SSL,认证和代理选项。pycurl支持大部分libcurl的回调函数。
multi 和 share 接口支持
可以和应用的I/O整合
二 pycurl使用案例
1.安装pycurl
CentOS6 下使用pip install pycurl安装
可以使用ipython来调试
2.获取一个url响应结果
import pycurl

try:
    # Python 3: the write callback receives bytes, so collect into BytesIO.
    from io import BytesIO
except ImportError:
    # Python 2 fallback.
    from StringIO import StringIO as BytesIO

# pycurl does not store the response body itself; it hands each chunk to the
# WRITEFUNCTION callback, so give it a buffer to write into.
buffer = BytesIO()
c = pycurl.Curl()
try:
    c.setopt(c.URL, 'http://pycurl.io/')
    c.setopt(c.WRITEFUNCTION, buffer.write)
    c.perform()
finally:
    # Release the handle even when the transfer fails.
    c.close()

body = buffer.getvalue()
print(body)
pycurl本身不会存储url的响应结果,因此,需要设置一个buffer,让pycurl将结果写入到这个buffer中
想要获取调试信息,可以设置
c.setopt(c.VERBOSE, True)
等同于 curl -v
3.审查响应头
在实际案例中,我们想要根据服务端的编码格式来解码响应结果
import re

import pycurl

try:
    from io import BytesIO
except ImportError:
    from StringIO import StringIO as BytesIO

# Response headers collected by header_function, keyed by lowercased name.
headers = {}


def header_function(header_line):
    """Record one raw response header line in the module-level ``headers``.

    ``header_line`` is the raw line handed over by pycurl. Lines without a
    colon (such as the ``HTTP/1.x ...`` status line) are ignored, so headers
    folded over several lines are not handled correctly. A repeated header
    overwrites the earlier value.
    """
    # HTTP standard specifies that headers are encoded in iso-8859-1.
    # On Python 2 the decoding step could be skipped; on Python 3 it is
    # required.
    header_line = header_line.decode('iso-8859-1')

    if ':' not in header_line:
        return

    # Split into name and value, then drop the trailing newline and any
    # whitespace around the colon.
    name, value = header_line.split(':', 1)

    # Header names are case insensitive, so store them lowercased.
    headers[name.strip().lower()] = value.strip()


buffer = BytesIO()
c = pycurl.Curl()
c.setopt(c.URL, 'http://pycurl.io')
c.setopt(c.WRITEFUNCTION, buffer.write)
# Set our header function.
c.setopt(c.HEADERFUNCTION, header_function)
c.perform()
c.close()

# Figure out what encoding was sent with the response, if any.
# Check against the lowercased header name.
encoding = None
if 'content-type' in headers:
    content_type = headers['content-type'].lower()
    # Raw string so that \S reaches the regex engine as an escape.
    match = re.search(r'charset=(\S+)', content_type)
    if match:
        encoding = match.group(1)
        print('Decoding using %s' % encoding)

if encoding is None:
    # Default encoding for HTML is iso-8859-1.
    # Other content types may have a different default encoding,
    # or in case of binary data, may have no encoding at all.
    encoding = 'iso-8859-1'
    print('Assuming encoding is %s' % encoding)

body = buffer.getvalue()
# Decode using the encoding we figured out.
print(body.decode(encoding))
|
4.将响应结果写入到文件
import pycurl

# Open the target in binary mode: the body is written as raw bytes, so no
# encoding or decoding step is involved.
with open('out.html', 'wb') as out_file:
    curl = pycurl.Curl()
    curl.setopt(pycurl.URL, 'http://pycurl.io/')
    # WRITEDATA makes pycurl write the body straight into the file object.
    curl.setopt(pycurl.WRITEDATA, out_file)
    curl.perform()
    curl.close()
这里最重要的部分就是以二进制模式打开文件,这样响应结果可以以字节码写入到文件中,不需要编码和解码。
5.跟踪url跳转
libcurl和pycurl默认不跟踪url跳转。
import pycurl

curl = pycurl.Curl()
# This URL redirects to https://www.python.org/.
curl.setopt(pycurl.URL, 'http://www.python.org/')
# Redirects are not followed unless FOLLOWLOCATION is switched on.
curl.setopt(pycurl.FOLLOWLOCATION, True)
curl.perform()
curl.close()
6.审查响应
import pycurl

try:
    from io import BytesIO
except ImportError:
    from StringIO import StringIO as BytesIO

body_buffer = BytesIO()
curl = pycurl.Curl()
curl.setopt(pycurl.URL, 'http://www.python.org/')
curl.setopt(pycurl.WRITEFUNCTION, body_buffer.write)
curl.perform()

# (output format, getinfo key) pairs, printed in this order.
report = (
    # Last used URL
    ('Effective_url: %s', pycurl.EFFECTIVE_URL),
    # HTTP response code
    ('Response_code: %d', pycurl.RESPONSE_CODE),
    # Total time of previous transfer
    ('Total_time: %f', pycurl.TOTAL_TIME),
    # Time from start until name resolving completed
    ('Namelookup_time: %f', pycurl.NAMELOOKUP_TIME),
    # Time from start until remote host or proxy completed
    ('Connect_time: %f', pycurl.CONNECT_TIME),
    # Time from start until SSL/SSH handshake completed
    ('SSL/SSH_time: %f', pycurl.APPCONNECT_TIME),
    # Time from start until just before the transfer begins
    ('Pretransfer_time: %f', pycurl.PRETRANSFER_TIME),
    # Time from start until just when the first byte is received
    ('Starttransfer_time: %f', pycurl.STARTTRANSFER_TIME),
    # Time taken for all redirect steps before the final transfer
    ('Redirect_time: %f', pycurl.REDIRECT_TIME),
    # Total number of redirects that were followed
    ('Redirect_count: %d', pycurl.REDIRECT_COUNT),
    # URL a redirect would take you to, had you enabled redirects
    ('Redirect_url: %s', pycurl.REDIRECT_URL),
    # Number of bytes uploaded
    ('Size_upload: %d', pycurl.SIZE_UPLOAD),
    # Average upload speed
    ('Speed_upload: %f', pycurl.SPEED_UPLOAD),
    # Number of bytes downloaded
    ('Size_download: %d', pycurl.SIZE_DOWNLOAD),
    # Average download speed
    ('Speed_download: %f', pycurl.SPEED_DOWNLOAD),
)
for line_format, info_key in report:
    print(line_format % curl.getinfo(info_key))

# getinfo must be called before close.
curl.close()
# python response_info.py
Effective_url: http://www.python.org/
Response_code: 301
Total_time: 0.105395
Namelookup_time: 0.051208
Connect_time: 0.078317
SSL/SSH_time: 0.000000
Pretransfer_time: 0.078322
Starttransfer_time: 0.105297
Redirect_time: 0.000000
Redirect_count: 0
Redirect_url: https://www.python.org/
Size_upload: 0
Speed_upload: 0.000000
Size_download: 0
Speed_download: 0.000000
| |
| |
7.发送表单数据
发送表单数据使用POSTFIELDS参数
import pycurl

try:
    # Python 3
    from urllib.parse import urlencode
except ImportError:
    # Python 2
    from urllib import urlencode

# Form data must be handed to pycurl already urlencoded.
form_fields = {'field': 'value'}
encoded_form = urlencode(form_fields)

curl = pycurl.Curl()
curl.setopt(pycurl.URL, 'http://pycurl.io/tests/testpostvars.php')
# POSTFIELDS sets the request method to POST, the Content-Type header to
# application/x-www-form-urlencoded and the data to send in the request body.
curl.setopt(pycurl.POSTFIELDS, encoded_form)
curl.perform()
curl.close()
8.文件上传
上传文件使用HTTPPOST参数,上传一个物理文件,使用FORM_FILE
import pycurl

# One multipart form field named 'fileupload' whose content is read from a
# file on disk (here: this script itself).
upload_form = [
    ('fileupload', (pycurl.FORM_FILE, __file__)),
]

curl = pycurl.Curl()
curl.setopt(pycurl.URL, 'http://pycurl.io/tests/testfileupload.php')
curl.setopt(pycurl.HTTPPOST, upload_form)
curl.perform()
curl.close()
为上传的文件设置不同的文件名和内容类型
import pycurl

file_part = (
    # Upload the contents of this file.
    pycurl.FORM_FILE, __file__,
    # Send it under a different file name.
    pycurl.FORM_FILENAME, 'helloworld.py',
    # Send it with a different content type.
    pycurl.FORM_CONTENTTYPE, 'application/x-python',
)

curl = pycurl.Curl()
curl.setopt(pycurl.URL, 'http://pycurl.io/tests/testfileupload.php')
curl.setopt(pycurl.HTTPPOST, [('fileupload', file_part)])
curl.perform()
curl.close()
| |
如果文件数据在内存中,使用BUFFER/BUFFERPTR
import pycurl

# The file content lives in memory: FORM_BUFFER supplies the file name and
# FORM_BUFFERPTR the data to upload.
memory_part = (
    pycurl.FORM_BUFFER, 'readme.txt',
    pycurl.FORM_BUFFERPTR, 'This is a fancy readme file',
)

curl = pycurl.Curl()
curl.setopt(pycurl.URL, 'http://pycurl.io/tests/testfileupload.php')
curl.setopt(pycurl.HTTPPOST, [('fileupload', memory_part)])
curl.perform()
curl.close()
9.处理FTP协议
import pycurl

ftp = pycurl.Curl()
ftp.setopt(pycurl.URL, 'ftp://ftp.sunet.se/')
ftp.setopt(pycurl.FTP_USE_EPSV, 1)
# FTP commands passed through the QUOTE option.
ftp.setopt(pycurl.QUOTE, ['cwd pub', 'type i'])
ftp.perform()
ftp.close()
10.Sharing Data
import sys
import threading

import pycurl

# sys was never imported in the original, so this line raised NameError.
# Writing to sys.stderr directly also works on both Python 2 and Python 3.
sys.stderr.write('Testing %s\n' % pycurl.version)


class Test(threading.Thread):
    """Thread that performs one transfer on a handle attached to a CurlShare."""

    def __init__(self, share):
        """Create the easy handle and attach it to the ``share`` object."""
        threading.Thread.__init__(self)
        self.curl = pycurl.Curl()
        self.curl.setopt(pycurl.URL, 'http://curl.haxx.se')
        self.curl.setopt(pycurl.SHARE, share)

    def run(self):
        """Run the transfer, then close the handle."""
        self.curl.perform()
        self.curl.close()


# Share cookie data and DNS data between the handles.
s = pycurl.CurlShare()
s.setopt(pycurl.SH_SHARE, pycurl.LOCK_DATA_COOKIE)
s.setopt(pycurl.SH_SHARE, pycurl.LOCK_DATA_DNS)

t1 = Test(s)
t2 = Test(s)

t1.start()
t2.start()
del s
11.使用multi接口
libcurl的easy接口是一个同步的,高效的,上手快的用于文件传输的接口。multi接口是一个异步的接口,它可以使用一个或者多个线程进行多路传输。
multi接口比easy接口多了以下几个功能:
提供一个pull接口:由使用libcurl的应用程序决定何时、何处让libcurl去接收或者发送数据
在同一个线程中启动多路同步传输而不必使应用程序变得更复杂
使得应用程序同时等待在应用程序本身的文件描述符和libcurl文件描述符上的动作变得简单许多
使得基于事件处理和扩展的传输可以达到上千个并行连接
例1
import pycurl

multi = pycurl.CurlMulti()
multi.handles = []

first = pycurl.Curl()
second = pycurl.Curl()
first.setopt(pycurl.URL, 'http://curl.haxx.se')
second.setopt(pycurl.URL, 'http://cnn.com')
second.setopt(pycurl.FOLLOWLOCATION, 1)

# Register both easy handles and remember them on the multi object.
for handle in (first, second):
    multi.add_handle(handle)
for handle in (first, second):
    multi.handles.append(handle)

# Drive the transfers until no handle is active any more.
active = len(multi.handles)
while active:
    while True:
        status, active = multi.perform()
        if status != pycurl.E_CALL_MULTI_PERFORM:
            break
    multi.select(1.0)

# Detach in reverse order, then close everything.
multi.remove_handle(second)
multi.remove_handle(first)
del multi.handles
multi.close()
first.close()
second.close()
| |
例2
import os
import sys

try:
    from cStringIO import StringIO
except ImportError:
    try:
        from StringIO import StringIO
    except ImportError:
        # Python 3: the write callback receives bytes.
        from io import BytesIO as StringIO

import pycurl

urls = (
    "http://curl.haxx.se",
    "http://www.python.org",
    "http://pycurl.sourceforge.net",
    "http://pycurl.sourceforge.net/tests/403_FORBIDDEN",  # that actually exists ;-)
    "http://pycurl.sourceforge.net/tests/404_NOT_FOUND",
)

# Read the list of URLs from the file named on the command line, if any.
# Text mode keeps the URLs as str on Python 3, and the with-block closes the
# file (the original left it open).
try:
    with open(sys.argv[1], "r") as url_file:
        urls = url_file.readlines()
except IndexError:
    # No file was specified: keep the built-in list.
    pass

# init
m = pycurl.CurlMulti()
m.handles = []
for url in urls:
    c = pycurl.Curl()
    # save info in standard Python attributes
    c.url = url.strip()
    c.body = StringIO()
    c.http_code = -1
    m.handles.append(c)
    # pycurl API calls
    c.setopt(c.URL, c.url)
    c.setopt(c.WRITEFUNCTION, c.body.write)
    c.setopt(c.FOLLOWLOCATION, True)
    m.add_handle(c)

# get data
num_handles = len(m.handles)
while num_handles:
    while 1:
        ret, num_handles = m.perform()
        print("%s %s" % (ret, num_handles))
        if ret != pycurl.E_CALL_MULTI_PERFORM:
            break
    # currently no more I/O is pending, could do something in the meantime
    # (display a progress bar, etc.)
    m.select(1.0)

# close handles
for c in m.handles:
    # save info in standard Python attributes
    c.http_code = c.getinfo(c.HTTP_CODE)
    # pycurl API calls
    m.remove_handle(c)
    c.close()
m.close()

# print result
for c in m.handles:
    data = c.body.getvalue()
    if 0:
        # Debug switch: dump the full body instead of the summary line.
        print("********** %s **********" % c.url)
        print(data)
    else:
        print("%-53s http_code %3d, %6d bytes" % (c.url, c.http_code, len(data)))
| |
例3
import os
import sys

try:
    from cStringIO import StringIO
except ImportError:
    try:
        from StringIO import StringIO
    except ImportError:
        # Python 3: the write callback receives bytes.
        from io import BytesIO as StringIO

import pycurl

urls = (
    "http://curl.haxx.se",
    "http://www.python.org",
    "http://pycurl.sourceforge.net",
    "http://pycurl.sourceforge.net/THIS_HANDLE_IS_CLOSED",
)

# init
m = pycurl.CurlMulti()
m.handles = []
for url in urls:
    c = pycurl.Curl()
    # save info in standard Python attributes
    c.url = url
    c.body = StringIO()
    c.http_code = -1
    c.debug = 0
    m.handles.append(c)
    # pycurl API calls
    c.setopt(c.URL, c.url)
    c.setopt(c.WRITEFUNCTION, c.body.write)
    c.setopt(c.FOLLOWLOCATION, True)
    m.add_handle(c)

# debug - deliberately close one handle before the transfers run
if 1:
    c = m.handles[3]
    c.debug = 1
    c.close()

# get data
num_handles = len(m.handles)
while num_handles:
    while 1:
        ret, num_handles = m.perform()
        if ret != pycurl.E_CALL_MULTI_PERFORM:
            break
    # currently no more I/O is pending, could do something in the meantime
    # (display a progress bar, etc.)
    m.select(1.0)

# close handles
for c in m.handles:
    # save info in standard Python attributes
    try:
        c.http_code = c.getinfo(c.HTTP_CODE)
    except pycurl.error:
        # handle already closed - see debug above
        assert c.debug
        c.http_code = -1
    # pycurl API calls: the three branches show alternative calling orders.
    if 0:
        m.remove_handle(c)
        c.close()
    elif 0:
        # in the C API this is the wrong calling order, but pycurl
        # handles this automatically
        c.close()
        m.remove_handle(c)
    else:
        # actually, remove_handle is called automatically on close
        c.close()
m.close()

# print result
for c in m.handles:
    data = c.body.getvalue()
    if 0:
        print("********** %s **********" % c.url)
    else:
        print("%-53s http_code %3d, %6d bytes" % (c.url, c.http_code, len(data)))
|
可以使用multi接口来缩短访问很多url的时间
假设一个文件中包含了很多个url,现在要通过脚本去访问每个url判断返回码是不是200
文件中共有87个url
方法一 使用python的for语句顺序访问每个url
import os
import sys

import pycurl

try:
    from StringIO import StringIO
except ImportError:
    # Python 3: the write callback receives bytes.
    from io import BytesIO as StringIO

# The URL list comes from the file named on the command line, or from stdin
# when the argument is "-". Only a missing argument or an unreadable file
# leads to the usage message; the original bare except hid every other error
# behind it as well.
try:
    if sys.argv[1] == "-":
        urls = sys.stdin.readlines()
    else:
        # Text mode keeps the URLs as str on Python 3.
        with open(sys.argv[1], 'r') as url_file:
            urls = url_file.readlines()
except (IndexError, IOError):
    print("Usage: %s check_urls.txt" % sys.argv[0])
    raise SystemExit


class Curl:
    """Small wrapper around one pycurl easy handle for a single URL."""

    def __init__(self, url):
        """Configure a handle that downloads ``url`` into ``self.body``."""
        self.url = url
        self.body = StringIO()
        self.http_code = 0

        self._curl = pycurl.Curl()
        self._curl.setopt(pycurl.URL, self.url)
        self._curl.setopt(pycurl.WRITEFUNCTION, self.body.write)
        self._curl.setopt(pycurl.FOLLOWLOCATION, True)
        self._curl.setopt(pycurl.NOSIGNAL, 1)

    def perform(self):
        """Run the transfer synchronously."""
        self._curl.perform()

    def close(self):
        """Store the HTTP status in ``self.http_code`` and close the handle."""
        # getinfo has to be called before the handle is closed.
        self.http_code = self._curl.getinfo(pycurl.HTTP_CODE)
        self._curl.close()


# Visit the URLs one after another, skipping blank lines and '#' comments.
for url in urls:
    url = url.strip()
    if not url or url[0] == '#':
        continue
    c = Curl(url)
    c.perform()
    c.close()
    print("%s %s" % (url, c.http_code))
real 2m46.134s
user 0m0.134s
sys 0m0.185s
| |
| |
方法二 使用pycurl的CurlMulti()函数
import sys

try:
    from StringIO import StringIO
except ImportError:
    # Python 3: the write callback receives bytes.
    from io import BytesIO as StringIO

import pycurl

# We should ignore SIGPIPE when using pycurl.NOSIGNAL - see
# the libcurl tutorial for more info.
# The original imported the misspelled name SIG_ING; that ImportError was
# swallowed below, so SIGPIPE was never actually ignored.
try:
    import signal
    from signal import SIGPIPE, SIG_IGN
    signal.signal(SIGPIPE, SIG_IGN)
except ImportError:
    pass

# A text file containing the URLs is required ("-" reads them from stdin).
# The original never imported sys, and its bare except turned the resulting
# NameError into the usage message, so the script could not get past here.
try:
    if sys.argv[1] == "-":
        urls = sys.stdin.readlines()
    else:
        # Text mode keeps the URLs as str on Python 3.
        with open(sys.argv[1], 'r') as url_file:
            urls = url_file.readlines()
except (IndexError, IOError):
    print("Usage: %s check_urls.txt <file with urls to check>" % sys.argv[0])
    raise SystemExit


class Curl:
    """Small wrapper around one pycurl easy handle for a single URL."""

    def __init__(self, url):
        """Configure a handle that downloads ``url`` into ``self.body``."""
        self.url = url
        self.body = StringIO()
        self.http_code = 0

        self._curl = pycurl.Curl()
        self._curl.setopt(pycurl.URL, self.url)
        self._curl.setopt(pycurl.FOLLOWLOCATION, True)
        self._curl.setopt(pycurl.WRITEFUNCTION, self.body.write)
        self._curl.setopt(pycurl.NOSIGNAL, 1)
        self._curl.debug = 0

    def perform(self):
        """Run the transfer synchronously."""
        self._curl.perform()

    def close(self):
        """Store the HTTP status in ``self.http_code`` and close the handle."""
        try:
            self.http_code = self._curl.getinfo(pycurl.HTTP_CODE)
        except pycurl.error:
            # The original asserted on an undefined name ``c`` here, which
            # would have raised NameError; report "no status" instead.
            self.http_code = 0
        self._curl.close()


def print_result(items, show_body=False):
    """Print one summary line per wrapper in ``items``.

    With ``show_body`` set, print the URL banner and the raw body instead
    (this was the permanently disabled ``if 0`` branch of the original).
    """
    for c in items:
        data = c.body.getvalue()
        if show_body:
            print("*************** %s ******************" % c.url)
            print(data)
        else:
            print("%-60s %3d %6d" % (c.url, c.http_code, len(data)))


def test_multi():
    """Fetch every URL in ``urls`` through one CurlMulti and print the results."""
    handles = []
    m = pycurl.CurlMulti()
    for url in urls:
        url = url.strip()
        # Skip blank lines and '#' comment lines.
        if not url or url[0] == '#':
            continue
        c = Curl(url)
        m.add_handle(c._curl)
        handles.append(c)

    # Kick off the transfers.
    while 1:
        ret, num_handles = m.perform()
        if ret != pycurl.E_CALL_MULTI_PERFORM:
            break

    # Wait for activity and keep driving until no handle is active.
    while num_handles:
        m.select(5.0)
        while 1:
            ret, num_handles = m.perform()
            if ret != pycurl.E_CALL_MULTI_PERFORM:
                break

    for c in handles:
        c.close()
    m.close()

    print_result(handles)


if __name__ == "__main__":
    test_multi()
real 2m46.049s
user 0m0.082s
sys 0m0.132s
在pycurl作者给的案例中,使用CurlMulti()接口处理多个url速度是最快的,但是当url数量多时速度并不快,而且有部分url还不能获取正确的返回值
方法三 使用python的多线程模块
python由于有GIL全局解释器锁的存在,python提供的threading模块不能充分利用多线程的优势,在多核CPU服务器上,同一时刻实际上只有一个线程在运行,其他线程都处于锁定状态。所以python的threading模块不适合用于处理CPU密集型任务,相反,线程数量越多,速度越慢。但是对于I/O密集型或者网络密集型任务,还是可以使用threading模块。
import os
import sys
import time
import threading

try:
    import queue as queue_module  # Python 3
except ImportError:
    import Queue as queue_module  # Python 2

try:
    from cStringIO import StringIO
except ImportError:
    try:
        from StringIO import StringIO
    except ImportError:
        # Python 3: the write callback receives bytes.
        from io import BytesIO as StringIO

import pycurl

# We should ignore SIGPIPE when using pycurl.NOSIGNAL - see
# the libcurl tutorial for more info.
# The original imported the misspelled name SIG_ING; that ImportError was
# swallowed below, so SIGPIPE was never actually ignored.
try:
    import signal
    from signal import SIGPIPE, SIG_IGN
    signal.signal(SIGPIPE, SIG_IGN)
except ImportError:
    pass

# A text file containing the URLs is required ("-" reads them from stdin).
# Only a missing argument or an unreadable file leads to the usage message.
try:
    if sys.argv[1] == "-":
        urls = sys.stdin.readlines()
    else:
        # Text mode keeps the URLs as str on Python 3.
        with open(sys.argv[1], 'r') as url_file:
            urls = url_file.readlines()
except (IndexError, IOError):
    print("Usage: %s check_urls.txt <file with urls to check>" % sys.argv[0])
    raise SystemExit


class Curl:
    """Small wrapper around one pycurl easy handle for a single URL."""

    def __init__(self, url):
        """Configure a handle that downloads ``url`` into ``self.body``."""
        self.url = url
        self.body = StringIO()
        self.http_code = 0

        self._curl = pycurl.Curl()
        self._curl.setopt(pycurl.URL, self.url)
        self._curl.setopt(pycurl.FOLLOWLOCATION, True)
        # Connect timeout and overall transfer timeout, both 15.
        self._curl.setopt(pycurl.CONNECTTIMEOUT, 15)
        self._curl.setopt(pycurl.TIMEOUT, 15)
        self._curl.setopt(pycurl.WRITEFUNCTION, self.body.write)
        self._curl.setopt(pycurl.NOSIGNAL, 1)
        self._curl.debug = 0

    def perform(self):
        """Run the transfer synchronously."""
        self._curl.perform()

    def close(self):
        """Store the HTTP status in ``self.http_code`` and close the handle."""
        try:
            self.http_code = self._curl.getinfo(pycurl.HTTP_CODE)
        except pycurl.error:
            # The original asserted on an undefined name ``c`` here, which
            # would have raised NameError; report "no status" instead.
            self.http_code = 0
        self._curl.close()


# Queue every non-blank, non-comment URL.
url_queue = queue_module.Queue()
for url in urls:
    url = url.strip()
    if not url or url[0] == "#":
        continue
    url_queue.put(url)

# Use the public qsize() instead of reaching into the private deque, and
# fail explicitly rather than with an assert that -O would strip.
num_urls = url_queue.qsize()
if not num_urls:
    raise SystemExit("no urls are given")

# One worker thread per URL, as in the original measurement.
num_conn = num_urls


class WorkerThread(threading.Thread):
    """Worker that checks URLs taken from a shared queue until it is empty."""

    def __init__(self, queue):
        """Remember the shared URL queue to consume from."""
        threading.Thread.__init__(self)
        self.queue = queue

    def run(self):
        """Fetch queued URLs one by one and print each HTTP status."""
        while True:
            try:
                url = self.queue.get_nowait()
            except queue_module.Empty:
                # Nothing left to do: end this worker normally.
                return
            c = Curl(url)
            c.perform()
            c.close()
            print("http_url:" + url + "\t" + "http_code:" + str(c.http_code))


# start a bunch of threads
threads = []
for dummy in range(num_conn):
    t = WorkerThread(url_queue)
    t.start()
    threads.append(t)

# wait for all threads to finish
for thread in threads:
    thread.join()
real 0m10.500s
user 0m0.149s
sys 0m0.196s
可以看到时间明显比以上两种方法缩短了很多
所以,对于有大量url需要用pycurl来处理时,应该结合threading模块
参考资料: