S05 - Crawling web page images and saving them locally
# coding=utf-8
import requests
from lxml import etree

url = 'http://www.spiderbuf.cn/playground/s05'
myheaders = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
html = requests.get(url, headers=myheaders)...
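The listing above is cut off after the initial request. Below is a minimal sketch of how the rest of an image-saving script could look, assuming the page exposes its pictures through ordinary <img src> attributes; the //img/@src XPath, the images/ output folder, and the absolute-URL prefix are assumptions, not details confirmed by the snippet.

# coding=utf-8
import os
import requests
from lxml import etree

url = 'http://www.spiderbuf.cn/playground/s05'
myheaders = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}

# Fetch the page and collect every image src (assumed markup: plain <img> tags).
html = requests.get(url, headers=myheaders).text
root = etree.HTML(html)
img_srcs = root.xpath('//img/@src')

os.makedirs('images', exist_ok=True)
for src in img_srcs:
    # Turn relative paths into absolute URLs (the prefix is an assumption).
    img_url = src if src.startswith('http') else 'http://www.spiderbuf.cn' + src
    img_bytes = requests.get(img_url, headers=myheaders).content
    # Save each image under its original file name inside images/.
    with open(os.path.join('images', img_url.split('/')[-1]), 'wb') as f:
        f.write(img_bytes)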
S04 - Analyzing pagination parameters and crawling across pages
# coding=utf-8
import requests
from lxml import etree
import re
base_url = 'https://www.spiderbuf.cn/playground/s04?pageno=%d'
myheaders = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
# get the number of pages
html = reque...
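The snippet is truncated right where the page count is read. A sketch of a full pagination loop follows; the pageno=(\d+) regex, the data04.txt output file, and the //tr row layout are assumptions about the page, not confirmed by the excerpt.

# coding=utf-8
import re
import requests
from lxml import etree

base_url = 'https://www.spiderbuf.cn/playground/s04?pageno=%d'
myheaders = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}

# Get the number of pages: fetch page 1 and pull every pageno value out of
# the pagination links (the regex assumes links of the form ?pageno=N).
first_page = requests.get(base_url % 1, headers=myheaders).text
page_numbers = re.findall(r'pageno=(\d+)', first_page)
max_page = max((int(n) for n in page_numbers), default=1)

# Crawl every page and append each table row to one data file.
with open('data04.txt', 'w', encoding='utf-8') as f:
    for pageno in range(1, max_page + 1):
        html = requests.get(base_url % pageno, headers=myheaders).text
        for tr in etree.HTML(html).xpath('//tr'):
            tds = tr.xpath('./td/text()')
            if tds:
                f.write('|'.join(tds) + '\n')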
S03 - Advanced lxml syntax and parsing practice
# coding=utf-8
import requests
from lxml import etree
url = 'http://www.spiderbuf.cn/playground/s03'
myheaders = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
html = requests.get(url, headers=myheaders).text
p...
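Since the code above breaks off after the request, here is a short sketch of the kind of "advanced" XPath selectors this exercise practices; the class name in the predicate, the positional index, and the data03.txt file name are illustrative assumptions about the page.

# coding=utf-8
import requests
from lxml import etree

url = 'http://www.spiderbuf.cn/playground/s03'
myheaders = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
html = requests.get(url, headers=myheaders).text
root = etree.HTML(html)

# Examples of the more advanced selector syntax (markup details are assumed):
rows = root.xpath('//table[contains(@class, "table")]//tr')  # attribute predicate
first_cells = root.xpath('//tr/td[1]/text()')                # positional index
links = root.xpath('//td/a/@href')                           # attribute extraction

# Write every non-empty row as one '|'-separated line.
with open('data03.txt', 'w', encoding='utf-8') as f:
    for tr in rows:
        texts = [t.strip() for t in tr.xpath('.//text()') if t.strip()]
        if texts:
            f.write('|'.join(texts) + '\n')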
S02 - Analyzing HTTP requests and constructing request headers
# coding=utf-8
import requests
from lxml import etree
url = 'http://www.spiderbuf.cn/playground/s02'
myheaders = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
html = requests.get(url, headers=myheaders).text
p...
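The point of this exercise is sending a browser-like User-Agent header. A minimal end-to-end sketch follows, assuming the data sits in a plain HTML table; the 02.html and data02.txt file names are only illustrative.

# coding=utf-8
import requests
from lxml import etree

url = 'http://www.spiderbuf.cn/playground/s02'
# Constructed request header: a browser-like User-Agent so the server
# treats the script as an ordinary client.
myheaders = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}

resp = requests.get(url, headers=myheaders)
print(resp.status_code)  # expect 200 once the header is accepted
html = resp.text

# Keep a copy of the raw page, then parse the table rows.
with open('02.html', 'w', encoding='utf-8') as f:
    f.write(html)

with open('data02.txt', 'w', encoding='utf-8') as f:
    for tr in etree.HTML(html).xpath('//tr'):
        tds = tr.xpath('./td/text()')
        if tds:
            f.write('|'.join(tds) + '\n')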
S01 - Getting started with the requests and lxml libraries
# coding=utf-8
import requests
from lxml import etree
url = 'https://www.spiderbuf.cn/playground/s01'
html = requests.get(url).text
f = open('01.html', 'w', encoding='utf-8')
f.write(html)
f.close()
root = etree.HTML(html)
trs = root.xpath('//tr')
f = open('data01.txt', 'w', encoding='utf-8')
...
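The excerpt stops right after data01.txt is opened. A sketch of the remaining row-writing loop is below, assuming the page is a simple table of <tr>/<td> rows and using '|' as the field separator; both are assumptions, not the exercise's confirmed format.

# coding=utf-8
import requests
from lxml import etree

url = 'https://www.spiderbuf.cn/playground/s01'
html = requests.get(url).text

# Save the raw page for later inspection.
with open('01.html', 'w', encoding='utf-8') as f:
    f.write(html)

# Parse the table and write one '|'-separated line per row.
root = etree.HTML(html)
trs = root.xpath('//tr')
with open('data01.txt', 'w', encoding='utf-8') as f:
    for tr in trs:
        tds = tr.xpath('./td/text()')
        if tds:
            f.write('|'.join(tds) + '\n')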