Common XPath operations

1. Get all the text data under a given node:

data = response.xpath('//div[@id="zoomcon"]')
content = ''.join(data.xpath('string(.)').extract())

This snippet collects all the text under the div with the specified id: string(.) serializes the whole subtree of each matched div to plain text, and ''.join() merges the results into a single string.
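
A minimal, self-contained sketch of the difference between //text() and string(.); the inline HTML and variable names here are made up for illustration and are not from the original project:

# -*- coding: utf-8 -*-
from scrapy.selector import Selector

# Hypothetical HTML used only to illustrate the two extraction styles.
html = '<div id="zoomcon"><p>first <b>bold</b></p><p>second</p></div>'
sel = Selector(text=html)

# //text() yields every text node as a separate list item.
pieces = sel.xpath('//div[@id="zoomcon"]//text()').extract()
# ['first ', 'bold', 'second']

# string(.) flattens the selected subtree into one string per matched node.
data = sel.xpath('//div[@id="zoomcon"]')
content = ''.join(data.xpath('string(.)').extract())
# 'first boldsecond'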

2. Get the value of an HTML node's attribute

>>> response.xpath("//div[@id='zoomtime']").extract()
[u'\r\n\u4e2d\u534e\u4eba\u6c11\u5171\u548c\u56fd\u56fd\u5bb6\u536b\u751f\u548c\u8ba1\u5212\u751f\u80b2\u59d4\u5458\u4f1a\r\n2010-10-26\r\n']
>>> response.xpath("//div[@id='zoomtime']/@title").extract()
[u'\u53d1\u5e03\u65e5\u671f\uff1a2010-10-26']

What we want here is the value of the title attribute on the element with that id; appending /@title to the XPath selects the attribute node directly. The escaped output above decodes to 发布日期：2010-10-26, i.e. "publish date: 2010-10-26".
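
A short hedged sketch of attribute extraction outside a live crawl; the markup mirrors the page structure shown above but is reconstructed for illustration:

# -*- coding: utf-8 -*-
from scrapy.selector import Selector

# Hypothetical markup mirroring the zoomtime div on the real page.
html = u'<div id="zoomtime" title="发布日期：2010-10-26">2010-10-26</div>'
sel = Selector(text=html)

# /@title selects the attribute value, not the element text.
titles = sel.xpath("//div[@id='zoomtime']/@title").extract()
# [u'发布日期：2010-10-26']

# Guarded first-match access, since extract() returns a (possibly empty) list.
title = titles[0] if titles else None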

 

Scrapy project structure (the original screenshot of the project tree is omitted; the two files that matter here are nhfpc.py, the spider, and common_lib.py, the shared database helper):


nhfpc.py

# -*- coding: utf-8 -*-
import scrapy
import sys
import hashlib
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from datetime import *
from common_lib import *

reload(sys)
sys.setdefaultencoding('utf-8')


class NhfpcItem(scrapy.Item):
    url = scrapy.Field()
    name = scrapy.Field()
    description = scrapy.Field()
    size = scrapy.Field()
    dateTime = scrapy.Field()


class NhfpcSpider(CrawlSpider):
    name = "nhfpc"
    allowed_domains = ["nhfpc.gov.cn"]
    start_urls = (
        'http://www.nhfpc.gov.cn/fzs/pzcfg/list.shtml',
        'http://www.nhfpc.gov.cn/fzs/pzcfg/list_2.shtml',
        'http://www.nhfpc.gov.cn/fzs/pzcfg/list_3.shtml',
        'http://www.nhfpc.gov.cn/fzs/pzcfg/list_4.shtml',
        'http://www.nhfpc.gov.cn/fzs/pzcfg/list_5.shtml',
        'http://www.nhfpc.gov.cn/fzs/pzcfg/list_6.shtml',
        'http://www.nhfpc.gov.cn/fzs/pzcfg/list_7.shtml',
    )

    rules = (
        # Detail pages live under six-digit, date-named directories.
        Rule(
            LinkExtractor(allow=r'.*\d{6}/.*'),
            callback='parse_item'
        ),
        Rule(
            LinkExtractor(allow=r'.*201307.*'),
            follow=True,
        ),
    )

    def parse_item(self, response):
        # The title div id varies between page templates; try both.
        retList = response.xpath("//div[@id='zoomtitle']/*/text()").extract()
        if len(retList) == 0:
            retList = response.xpath("//div[@id='zoomtitl']/*/text()").extract()
        title = retList[0].strip()

        # Same fallback pattern for the body container.
        data = response.xpath('//div[@id="zoomcon"]')
        if len(data) == 0:
            data = response.xpath('//div[@id="contentzoom"]')
        content = ''.join(data.xpath('string(.)').extract())

        # Publish time: prefer the zoomtime title attribute
        # ("发布日期:YYYY-MM-DD"), else fall back to <ucmspubtime>.
        time = response.xpath("//div[@id='zoomtime']/@title").extract()
        if len(time) == 0:
            time = response.xpath("//ucmspubtime/text()").extract()
        else:
            time = ''.join(time).split(":")[1]
        pubTime = ''.join(time) + " 00:00:00"

        insertTime = datetime.now()
        webSite = "nhfpc.gov.cn"

        # The MD5 of the URL doubles as a fixed-length dedup key in the DB.
        md5Url = hashlib.md5(response.url.encode('utf-8')).hexdigest()

        values = []
        values.append(title)
        values.append(md5Url)
        values.append(pubTime)
        values.append(insertTime)
        values.append(webSite)
        values.append(content)
        values.append(response.url)
        insertDB(values)
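
One fragile spot worth noting: the shell session in section 2 shows the title attribute using a full-width colon (\uff1a), while parse_item splits on the ASCII ':'. A hedged sketch of a more tolerant date extraction; the helper name is mine, not part of the original project:

# -*- coding: utf-8 -*-
import re

def extract_pub_date(raw):
    """Pull a YYYY-MM-DD date out of strings like u'发布日期：2010-10-26'
    or u'发布日期:2010-10-26', regardless of which colon variant the
    page template used."""
    m = re.search(r'\d{4}-\d{2}-\d{2}', raw)
    return m.group(0) if m else None

print(extract_pub_date(u'\u53d1\u5e03\u65e5\u671f\uff1a2010-10-26'))
# 2010-10-26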

 

common_lib.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
This file includes all the common routines needed in the crawler project.
Author: Justnzhang @(uestczhangchao@qq.com)
Time: 2014-07-28 15:03:44
'''
import sys
import MySQLdb

reload(sys)
sys.setdefaultencoding('utf-8')


def insertDB(values):
    """Insert one crawled record; `values` must hold, in order:
    title, md5url, pub_time, inser_time, website, content, url."""
    print "insertDB"
    print values
    try:
        conn_local = MySQLdb.connect(host='192.168.30.7', user='xxx',
                                     passwd='xxx', db='xxx', port=3306)
        # Force utf8 on the connection so Chinese text round-trips intact.
        conn_local.set_character_set('utf8')
        cur_local = conn_local.cursor()
        cur_local.execute('SET NAMES utf8;')
        cur_local.execute('SET CHARACTER SET utf8;')
        cur_local.execute('SET character_set_connection=utf8;')

        # Parameterized insert: MySQLdb handles quoting and escaping.
        # NULL lets the hid AUTO_INCREMENT column assign itself.
        cur_local.execute(
            "insert into health_policy values(NULL,%s,%s,%s,%s,%s,%s,%s)",
            values)
        conn_local.commit()
        cur_local.close()
        conn_local.close()
    except MySQLdb.Error, e:
        print "Mysql Error %d: %s" % (e.args[0], e.args[1])


if __name__ == '__main__':
    # Smoke test with dummy values in the expected column order.
    insertDB(["t", "md5", "2014-04-11 00:00:00", "2014-04-11 00:00:00",
              "site", "content", "http://example.com"])
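
A hedged usage sketch showing the column order the parameterized INSERT expects; the URL and all field values here are made-up placeholders:

# -*- coding: utf-8 -*-
import hashlib
from datetime import datetime
from common_lib import insertDB

# Hypothetical detail-page URL, for illustration only.
url = 'http://www.nhfpc.gov.cn/fzs/pzcfg/example.shtml'

insertDB([
    u'示例政策标题',                  # title
    hashlib.md5(url).hexdigest(),    # md5url (fixed-length dedup key)
    '2010-10-26 00:00:00',           # pub_time
    datetime.now(),                  # inser_time
    'nhfpc.gov.cn',                  # website
    u'正文内容',                     # content
    url,                             # url
])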

The health_policy table structure:

SET FOREIGN_KEY_CHECKS=0;

-- ----------------------------
-- Table structure for health_policy
-- ----------------------------
DROP TABLE IF EXISTS `health_policy`;
CREATE TABLE `health_policy` (
  `hid` int(11) NOT NULL AUTO_INCREMENT,
  `title` varchar(1000) DEFAULT NULL COMMENT 'Policy title',
  `md5url` varchar(1000) NOT NULL COMMENT 'MD5 hash of the source URL',
  `pub_time` datetime DEFAULT NULL COMMENT 'Publish time',
  `inser_time` datetime NOT NULL COMMENT 'Insert time',
  `website` varchar(1000) DEFAULT NULL COMMENT 'Source website',
  `content` longtext COMMENT 'Policy content',
  `url` varchar(1000) DEFAULT NULL,
  PRIMARY KEY (`hid`)
) ENGINE=InnoDB AUTO_INCREMENT=594 DEFAULT CHARSET=utf8;
