Python - Static IP Pool

Generating proxies

# -*- coding=utf-8 -*-
import urllib2
import re
import requests
import random
import time


class Proxy():

    def init(self):
        # note: init() is called explicitly from test(); it is not __init__
        # static IP pool
        self.pool = []
        self.agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:50.0) Gecko/20100101 Firefox/50.0'
        self.header = {'User-Agent': self.agent}

    def isAlive(self, ip):
        # proxy address as a dict
        proxy = {'http': ip}
        # route urllib2 through the proxy
        proxy_handler = urllib2.ProxyHandler(proxies=proxy)
        opener = urllib2.build_opener(proxy_handler)
        urllib2.install_opener(opener)
        # hit Baidu to see whether the proxy answers
        baidu_url = 'https://www.baidu.com/'
        req = urllib2.Request(url=baidu_url, headers=self.header)
        try:
            # give the proxy 2 seconds to respond
            res = urllib2.urlopen(req, timeout=2)
            if res.code == 200:
                return True
            else:
                return False
        except:
            return False

    def initPool(self):
        haodaili_seed = 'http://www.kuaidaili.com/proxylist/'
        # pages 1 through 10
        for i in xrange(1, 11):
            haodaili_url = haodaili_seed + str(i)
            haodaili_page = requests.get(url=haodaili_url)
            # print haodaili_page.status_code
            haodaili_content = haodaili_page.text
            # NOTE: the HTML tags inside the original pattern were lost when the
            # post was scraped; the <td data-title=...> tags below are an assumed
            # reconstruction of kuaidaili's table markup - verify against the live page
            pat = re.compile(r'<td data-title="IP">\s*?(.*?)\s*?</td>\s*?<td data-title="PORT">(\d*?)</td>.*?', re.S)
            proxy_list = re.findall(pattern=pat, string=haodaili_content)
            for p in proxy_list:
                url = 'http://' + p[0] + ':' + p[1]
                self.pool.append(url)

    def refreshPool(self):
        # keep only the proxies that are still alive
        self.pool = filter(lambda ip: self.isAlive(ip), self.pool)
        # dump the surviving addresses to disk
        for ip in self.pool:
            with open('./ip_proxy.dat', 'a') as f:
                f.write(ip + '\n')

    def getProxy(self):
        # randint's upper bound is inclusive, so subtract 1 to stay inside the list
        rand_index = random.randint(0, len(self.pool) - 1)
        return self.pool[rand_index]

    def test(self):
        print 'pool making...'
        # set up attributes
        self.init()
        # fill the pool
        self.initPool()
        # clean the pool
        self.refreshPool()
        # print random proxy addresses
        for i in xrange(len(self.pool)):
            time.sleep(2)
            print self.getProxy()


if __name__ == '__main__':
    p = Proxy()
    p.test()
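The code above targets Python 2 (urllib2, xrange, print statements). For reference, here is a minimal Python 3 style sketch of the same liveness check using only requests; like the original, it assumes a proxy counts as alive if a GET to https://www.baidu.com/ through it returns HTTP 200 within 2 seconds. The function name is_alive is illustrative, not from the original post.

import requests

def is_alive(ip, timeout=2):
    # Consider the proxy usable if Baidu answers 200 through it within `timeout` seconds.
    proxies = {'http': ip, 'https': ip}
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:50.0) '
                             'Gecko/20100101 Firefox/50.0'}
    try:
        res = requests.get('https://www.baidu.com/', headers=headers,
                           proxies=proxies, timeout=timeout)
        return res.status_code == 200
    except requests.RequestException:
        return False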

Using a proxy

def climb(self, url):
    # pull a random proxy from the pool and route both http and https through it
    ip = self.getProxy()
    proxies = {
        "http": ip,
        "https": ip
    }
    anjuke_page = requests.get(url=url, proxies=proxies)
    anjuke_loop = (anjuke_page.content.decode('utf-8', 'ignore')
                                      .encode('utf-8', 'ignore'))
    # detail-page url, name, status, price, coordinates, floor plan
    # NOTE: the HTML tags and most capture groups of the original pattern were
    # lost when the post was scraped; the placeholder below must be rebuilt
    # against Anjuke's page markup before it will extract anything useful
    pat = re.compile(r'.*?(.*?).*?', re.S)
    realty_info = re.findall(pattern=pat, string=anjuke_loop)
    print realty_info
    # price, phone number
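Because any single proxy can go dead between refreshPool() runs, it can help to wrap the request in a small retry loop that draws a fresh address from the pool on each failure. Below is a minimal sketch under that assumption; fetch_with_proxy and max_retries are illustrative names, not part of the original post, and p is a Proxy instance whose pool has already been filled and cleaned.

import requests

def fetch_with_proxy(p, url, max_retries=3, timeout=5):
    # Try up to `max_retries` different random proxies before giving up.
    for _ in range(max_retries):
        ip = p.getProxy()
        proxies = {'http': ip, 'https': ip}
        try:
            return requests.get(url, proxies=proxies, timeout=timeout)
        except requests.RequestException:
            continue  # this proxy failed; draw another one
    raise RuntimeError('all proxy attempts failed for ' + url)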

Reposted from: https://lipenglin.blog.csdn.net/article/details/53574621 If this violates your copyright, please leave a comment with the address of the original article and we will delete this post; we apologize for any inconvenience.

