day_5:动态渲染页面爬取
发布日期:2021-08-15 22:29:35 浏览次数:34 分类:技术文章

本文共 4581 字,大约阅读时间需要 15 分钟。

一、Selenium

1、声明浏览器对象

from selenium import webdriver

browser_chrome = webdriver.Chrome()
browser_firefox = webdriver.Firefox()
browser_edge = webdriver.Edge()
browser_phantomjs = webdriver.PhantomJS()
browser_safari = webdriver.Safari()

2、访问页面

from selenium import webdriver

browser = webdriver.Chrome()
browser.get('https://www.taobao.com')
print(browser.page_source)
browser.close()

3、查找节点

# 定位单个节点
find_element_by_id
find_element_by_name
find_element_by_xpath
find_element_by_link_text  # 通过精确文本定位
find_element_by_partial_link_text  # 通过模糊文本定位
find_element_by_tag_name
find_element_by_class_name
find_element_by_css_selector

# 定位多个节点
find_elements_by_name
find_elements_by_xpath
find_elements_by_link_text
find_elements_by_partial_link_text
find_elements_by_tag_name
find_elements_by_class_name
find_elements_by_css_selector

# 公共方法
find_element(By.x, 'x')
find_elements(By.x, 'x')

# By类型
ID = "id"
XPATH = "xpath"
LINK_TEXT = "link text"
PARTIAL_LINK_TEXT = "partial link text"
NAME = "name"
TAG_NAME = "tag name"
CLASS_NAME = "class name"
CSS_SELECTOR = "css selector"
# 4种方法查找淘宝首页搜索框节点
from selenium import webdriver

browser = webdriver.Chrome()
browser.get('https://www.taobao.com')
input_id = browser.find_element_by_id('q')
input_name = browser.find_element_by_name('q')
input_css = browser.find_element_by_css_selector('#q')
input_xpath = browser.find_element_by_xpath('//*[@id="q"]')
print(input_id)
print(input_name)
print(input_css)
print(input_xpath)
browser.close()
from selenium import webdriver
from selenium.webdriver.common.by import By

browser = webdriver.Chrome()
browser.get('https://www.taobao.com')
# find_element(查找方式By,查找值),以下两种方式等价
input_first = browser.find_element(By.ID, 'q')
input_second = browser.find_element_by_id('q')
print(input_first)
print(input_second)
browser.close()
from selenium import webdriver

browser = webdriver.Chrome()
browser.get('https://www.taobao.com')
# 查找淘宝左侧导航栏所有li标签
lis = browser.find_elements_by_css_selector('.service-bd li')
for li in lis:
    print(li)
browser.close()
from selenium import webdriver

browser = webdriver.Chrome()
browser.get('https://www.zhihu.com/explore')
logo = browser.find_element_by_id('zh-top-link-logo')
print(logo)
# get_attribute(属性名称)获取节点属性
print(logo.get_attribute('class'))
print(logo.get_attribute('href'))
print(logo.get_attribute('id'))
print(logo.get_attribute('data-za-c'))
# logo.text 获取节点文本
print(logo.text)
print(logo.id)
# logo.location 获取节点位置
print(logo.location)
print(logo.parent)
print(logo.rect)
# logo.tag_name 获取节点名称
print(logo.tag_name)
print(logo.size)
browser.close()

4、节点交互

# 淘宝搜索商品
from selenium import webdriver
import time

browser = webdriver.Chrome()
browser.get('https://www.taobao.com')
# 查找输入框
input = browser.find_element_by_id('q')
# 在输入框中输入
input.send_keys('iPhone')
time.sleep(1)
# 清空输入框
input.clear()
# 在输入框中输入
input.send_keys('iPad')
# 查找搜索按钮
btn = browser.find_element_by_css_selector('.btn-search')
# 点击按钮
btn.click()

5、动作链(鼠标滑动、拖拽,键盘按键等)

# select下拉框
from selenium.webdriver.support.ui import Select

select = Select(driver.find_element_by_name('name'))
select.select_by_index(index)  # 通过索引定位
select.select_by_visible_text("text")  # 通过文本定位
select.select_by_value(value)  # 通过值定位

select = Select(driver.find_element_by_id('id'))
select.deselect_all()  # 取消所有选择
# 拖放:将元素移动一定量,或者移动到另一个元素
from selenium.webdriver import ActionChains

element = driver.find_element_by_name("source")
target = driver.find_element_by_name("target")
action_chains = ActionChains(driver)
action_chains.drag_and_drop(element, target).perform()
# cookies
driver.get("http://www.example.com")
cookie = {'name': 'foo', 'value': 'bar'}
driver.add_cookie(cookie)
driver.get_cookies()

6、等待(页面加载过慢导致找不到节点,需要等待页面加载完成再找节点)。显式等待设置的是针对某个条件的最长等待时间,条件满足即返回,尽量使用显式等待;隐式等待设置的是全局固定的等待时长。

# 显式等待
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Firefox()
driver.get("http://somedomain/url_that_delays_loading")
try:
    element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "myDynamicElement"))
    )
finally:
    driver.quit()

# 隐式等待
from selenium import webdriver

driver = webdriver.Firefox()
driver.implicitly_wait(10)  # seconds
driver.get("http://somedomain/url_that_delays_loading")
myDynamicElement = driver.find_element_by_id("myDynamicElement")

7、Headless模式(无界面模式)

from selenium import webdriver

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
driver = webdriver.Chrome(chrome_options=chrome_options)
driver.get('https://www.taobao.com')
print(driver.page_source)

 

转载于:https://www.cnblogs.com/jp-mao/p/10041514.html

转载地址:https://blog.csdn.net/weixin_30781107/article/details/97128142 如侵犯您的版权,请留言回复原文章的地址,我们会给您删除此文章,给您带来不便请您谅解!

上一篇:Mac Sublime Text 3 配置Python环境及安装插件
下一篇:十二周作品

发表评论

最新留言

很好
[***.229.124.182]2024年04月09日 11时01分03秒

关于作者

    喝酒易醉,品茶养心,人生如梦,品茶悟道,何以解忧?唯有杜康!
-- 愿君每日到此一游!

推荐文章