一、从网页上下载指定文件

urlretrieve()方法直接将远程数据下载到本地。

urlretrieve(url, filename=None, reporthook=None, data=None)

url - 指定了从哪下载

filename - 指定了保存到本地的路径（如果参数未指定，urllib会生成一个临时文件保存数据。）

reporthook - 是一个回调函数，当连接上服务器、以及相应的数据块传输完毕时会触发该回调，我们可以利用这个回调函数来显示当前的下载进度。

data - 指post到服务器的数据。

该方法返回一个包含两个元素的(filename, headers)元组，filename表示保存到本地的路径，headers表示服务器的响应头。

from urllib.parse import urljoin
from urllib.request import urlopen, urlretrieve

from bs4 import BeautifulSoup

# Page whose logo image is downloaded.
PAGE_URL = "http://www.pythonscraping.com"

# Fetch and parse the page; the `with` block closes the HTTP response
# as soon as BeautifulSoup has consumed it.
with urlopen(PAGE_URL) as html:
    bsObj = BeautifulSoup(html, "lxml")

# Locate the image address: the <img> nested inside the <a id="logo"> element.
imageLocation = bsObj.find("a", {"id": "logo"}).find("img")["src"]

# urlretrieve() needs an absolute URL; urljoin() leaves an absolute `src`
# untouched and resolves a relative one against the page URL.
imageLocation = urljoin(PAGE_URL, imageLocation)

# Download the image and save it as logo.jpg in the current directory.
urlretrieve(imageLocation, "logo.jpg")

二、下载带有指定src标签的文件

import os
from urllib.parse import urlsplit
from urllib.request import urlopen
from urllib.request import urlretrieve

downloadDirectory = "downloaded/"
baseUrl = "http://pythonscraping.com"

# Translation table that deletes the characters correct_title() strips
# from a path: / \ : * ? " | < >
_INVALID_PATH_CHARS = str.maketrans("", "", '/\\:*?"|<>')


def getAbsoluteURL(baseUrl, source):
    """Normalize a ``src`` value into an absolute URL on the base site.

    Args:
        baseUrl: Site root without a trailing slash,
            e.g. ``"http://pythonscraping.com"``.
        source: Raw ``src`` attribute value taken from the page.

    Returns:
        The cleaned absolute URL (a leading ``www.`` host prefix is
        dropped), or ``None`` when the result does not contain
        ``baseUrl`` and is therefore treated as an external link.
    """
    if source.startswith("//"):
        # Protocol-relative URL: give it the base site's scheme so it goes
        # through the absolute-URL branches below instead of being glued
        # onto baseUrl as if it were a relative path.
        scheme = urlsplit(baseUrl).scheme or "http"
        source = scheme + ":" + source

    if source.startswith("http://www."):
        url = "http://" + source[11:]
    elif source.startswith(("http://", "https://")):
        # Already absolute; https must be recognised here, otherwise it
        # would be treated as a relative path and appended to baseUrl.
        url = source
    elif source.startswith("www."):
        url = "http://" + source[4:]
    else:
        # Relative path; strip leading slashes to avoid "base//path".
        url = baseUrl + "/" + source.lstrip("/")

    if baseUrl not in url:
        return None
    return url


def correct_title(title):
    """Return ``title`` with the characters ``/ \\ : * ? " | < >`` removed."""
    return title.translate(_INVALID_PATH_CHARS)


def getDownloadPath(baseUrl, absoluteUrl, downloadDirectory):
    """Build the local save path for a URL and make sure its folder exists.

    Args:
        baseUrl: Site root that is stripped from ``absoluteUrl``.
        absoluteUrl: Absolute URL of the file being downloaded.
        downloadDirectory: Prefix prepended to the cleaned file name.

    Returns:
        ``downloadDirectory`` followed by the URL path with ``www.``,
        ``baseUrl`` and the special characters removed.
    """
    path = absoluteUrl.replace("www.", "")
    path = path.replace(baseUrl, "")
    path = correct_title(path)
    path = downloadDirectory + path

    # Create the containing folder if needed. exist_ok avoids the
    # check-then-create race, and an empty dirname (no folder component)
    # is skipped because os.makedirs("") raises.
    directory = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok=True)
    return path


def main():
    """Download every resource on the home page that has a ``src`` attribute."""
    # Imported here so the helper functions above stay importable on a
    # machine without BeautifulSoup installed.
    from bs4 import BeautifulSoup

    with urlopen("http://www.pythonscraping.com") as html:
        bsObj = BeautifulSoup(html, "lxml")

    # Select every tag on the home page that carries a src attribute.
    downloadList = bsObj.findAll(src=True)
    for download in downloadList:
        fileUrl = getAbsoluteURL(baseUrl, download["src"])
        if fileUrl is None:
            continue
        print(fileUrl)
        try:
            urlretrieve(fileUrl, getDownloadPath(baseUrl, fileUrl, downloadDirectory))
        except Exception as e:
            # Best effort: report the failure and move on to the next file.
            # Exception (not BaseException) so Ctrl+C still stops the script.
            print(str(e))


if __name__ == "__main__":
    main()

三、保存网页的数据到CSV

import csv
import os
from urllib.request import urlopen

from bs4 import BeautifulSoup

# Destination of the generated CSV file.
OUTPUT_PATH = "../files/editors.csv"

# Fetch and parse the page; the `with` block closes the HTTP response
# once BeautifulSoup has read it.
with urlopen("http://en.wikipedia.org/wiki/Comparison_of_text_editors") as html:
    bsObj = BeautifulSoup(html, "lxml")

# The main comparison table is the first "wikitable" on the page.
table = bsObj.findAll("table", {"class": "wikitable"})[0]
rows = table.findAll("tr")

# Make sure the output folder exists; open() does not create missing
# directories and would fail with FileNotFoundError otherwise.
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

# newline='' lets the csv module control line endings itself; the `with`
# block closes the file even if writing a row fails.
with open(OUTPUT_PATH, 'wt', newline='', encoding='utf-8') as csvFile:
    writer = csv.writer(csvFile)
    for row in rows:
        # One CSV row per <tr>, one column per header/data cell.
        writer.writerow([cell.get_text() for cell in row.findAll(['td', 'th'])])

四、随机漫步

from random import choice


class RandomWalk():
    """Generate the points of a two-dimensional random walk.

    Attributes are plain lists so they can be handed straight to a
    plotting function after ``fill_walk()`` has run.
    """

    def __init__(self, num_points=5000):
        """Initialize the walk.

        Args:
            num_points: Total number of points the walk should contain,
                including the starting point.
        """
        self.num_points = num_points

        # Every walk starts at (0, 0).
        self.x_values = [0]
        self.y_values = [0]

    @staticmethod
    def _get_step():
        """Return one signed step: a direction (+1/-1) times a distance 0-4."""
        direction = choice([1, -1])
        distance = choice([0, 1, 2, 3, 4])
        return direction * distance

    def fill_walk(self):
        """Append points until the walk contains ``num_points`` points."""
        while len(self.x_values) < self.num_points:
            # Positive x is right, negative is left; a zero x step means
            # the move is purely vertical.
            x_step = self._get_step()
            # Positive y is up, negative is down; a zero y step means
            # the move is purely horizontal.
            y_step = self._get_step()

            # Reject moves that go nowhere.
            if x_step == 0 and y_step == 0:
                continue

            # The next point is the last point shifted by the step.
            self.x_values.append(self.x_values[-1] + x_step)
            self.y_values.append(self.y_values[-1] + y_step)


def main():
    """Build one random walk and show it as a scatter plot."""
    # Imported here so RandomWalk stays importable without matplotlib.
    import matplotlib.pyplot as plt

    rw = RandomWalk()
    rw.fill_walk()

    # figsize is the window size in inches as (width, height).
    plt.figure(figsize=(10, 6))

    # Color the points by their order in the walk. The list is sized from
    # the actual number of points so it always matches x_values/y_values.
    point_numbers = list(range(len(rw.x_values)))
    plt.scatter(rw.x_values, rw.y_values, c=point_numbers,
                cmap=plt.cm.Blues, edgecolors='none', s=1)

    # Emphasize the start (green) and end (red) points.
    plt.scatter(0, 0, c='green', edgecolors='none', s=100)
    plt.scatter(rw.x_values[-1], rw.y_values[-1], c='red',
                edgecolors='none', s=100)

    # Hide both axes. plt.gca() returns the Axes the scatter calls drew on;
    # plt.axes() with no arguments would add a new, empty Axes instead.
    ax = plt.gca()
    ax.get_xaxis().set_visible(False)
    ax.get_yaxis().set_visible(False)

    plt.show()


if __name__ == "__main__":
    main()

五、调用API查看github高星Python语言

import requests

# Make the API call and store the response. The timeout keeps the script
# from hanging indefinitely if the server never answers.
url = 'https://api.github.com/search/repositories?q=language:python&sort=stars'
r = requests.get(url, timeout=10)
print("Status code:", r.status_code)

# Stop with a clear HTTP error on a failed request (for example when the
# API rate limit is hit) instead of a KeyError on the lookups below.
r.raise_for_status()

# Store the decoded JSON response in a variable.
response_dict = r.json()
print("Total repositories:", response_dict['total_count'])

# Explore the information about the repositories.
repo_dicts = response_dict['items']
print("Repositories returned:", len(repo_dicts))

print("\nSelected information about each repository:")
for repo_dict in repo_dicts:
    print('\nName:', repo_dict['name'])
    print('Owner:', repo_dict['owner']['login'])
    print('Stars:', repo_dict['stargazers_count'])
    print('Repository:', repo_dict['html_url'])
    print('Description:', repo_dict['description'])

转载地址：https://darkness.blog.csdn.net/article/details/109314970 如侵犯您的版权，请留言回复原文章的地址，我们会给您删除此文章，给您带来不便请您谅解！

上一篇：【机器学习】线性回归ex1data1

下一篇：Python3网络爬虫数据采集（1~3）

发表评论

关于作者

喝酒易醉，品茶养心，人生如梦，品茶悟道，何以解忧？唯有杜康！

-- 愿君每日到此一游！

一、从网页上下载指定文件

二、下载带有指定src标签的文件

三、保存网页的数据到CSV

四、随机漫步

五、调用API查看github高星Python语言

发表评论

最新留言

关于作者

推荐文章