HTTPX 是 Python 3 的全功能 HTTP 客户端,它提供同步和异步 API,并支持 HTTP/1.1 和 HTTP/2。
HTTPX 建立在 requests 公认的可用性之上,并为您提供:
HTTPX 项目依赖于这些优秀的库:
以及这些可选安装:
httpx请求与requests请求格式保持一致;
import httpx

# --- Quick start: httpx mirrors the requests API ---------------------------
params = {'key1': 'value1', 'key2': 'value2'}
r = httpx.get('https://httpbin.org/get', params=params)
print(f"http状态:{r.status_code}, encoding:{r.encoding}, url:{r.url}, text:{r.text}")
# -> 200, UTF-8, URL('https://httpbin.org/get?key2=value2&key1=value1'), 'text balabala...'
# r.status_code == httpx.codes.OK
# r.raise_for_status() raises for any non-2xx response
data = {'key': 'value'}
r = httpx.post('https://httpbin.org/post', data=data)   # form-encoded body
r = httpx.post("https://httpbin.org/post", json=data)   # JSON body
# Open the upload inside a context manager so the handle is closed after the
# requests complete (the original left the file object open forever).
with open('report.xls', 'rb') as f:
    files = {'upload-file': f}
    r = httpx.post("https://httpbin.org/post", files=files)  # file upload
    r = httpx.post("https://httpbin.org/post", data=data, files=files)
content = b'Hello, world'
r = httpx.post("https://httpbin.org/post", content=content)  # raw binary body
r = httpx.put('https://httpbin.org/put', data={'key': 'value'})
r = httpx.delete('https://httpbin.org/delete')
r = httpx.head('https://httpbin.org/get')
r = httpx.options('https://httpbin.org/get')
# Requesting via an explicit httpx.Request instance.
# To dispatch a Request over the network, create a Client and use .send().
request = httpx.Request("GET", "https://example.com")
with httpx.Client() as client:
    response = client.send(request)

# .build_request() lets you modify the Request arbitrarily before sending;
# client-level headers are merged into the built request.
headers = {"X-Api-Key": "...", "X-Client-ID": "ABC123"}
with httpx.Client(headers=headers) as client:
    request = client.build_request("GET", "https://api.example.com")
    response = client.send(request)
# --- Streaming responses ----------------------------------------------------
# Binary chunks
with httpx.stream("GET", "https://www.example.com") as r:
    for data in r.iter_bytes():
        print(data)
# Decoded text chunks
with httpx.stream("GET", "https://www.example.com") as r:
    for text in r.iter_text():
        print(text)
# Text, line by line
with httpx.stream("GET", "https://www.example.com") as r:
    for line in r.iter_lines():
        print(line)
默认情况下,HTTPX不会跟随所有 HTTP 方法的重定向,尽管这可以显式启用。
>>> r = httpx.get('http://github.com/', follow_redirects=True)
>>> r.url
URL('https://github.com/')
>>> r.status_code
200
>>> r.history
[<Response [301 Moved Permanently]>]
使用httpx.get/post等方法请求api时,HTTPX 必须为每个请求建立一个新连接(连接不被重用)。随着对主机的请求数量增加,这很快就会变得低效。
Client实例使用HTTP 连接池。这意味着当您向同一主机发出多个请求时,Client将重用底层 TCP 连接,而不是为每个请求重新创建一个。
与使用顶级 API 相比,这可以带来显著的性能提升,包括:
Client实例还支持顶级 API 中不可用的功能,例如:
httpx.Client()用法
# Preferred: the context manager closes the connection pool on exit.
with httpx.Client() as client:
    r = client.get('https://example.com')
# Without a with-block, close the pool explicitly with .close();
# try/finally guarantees cleanup even if the request raises.
client = httpx.Client()
try:
    r = client.get('https://example.com')
finally:
    client.close()
将httpx客户端配置为使用 WSGI 协议直接调用 Python Web 应用程序.
from flask import Flask
import httpx

app = Flask(__name__)


@app.route("/")
def hello():
    """Root route of the demo WSGI app."""
    return "Hello World!"


# Configure the httpx client to call the Python WSGI app directly, in-process
# (no real network traffic).
# NOTE(review): the app= shortcut was deprecated and later removed in newer
# httpx releases; current versions use transport=httpx.WSGITransport(app=app).
# Confirm against the installed httpx version.
with httpx.Client(app=app, base_url="http://testserver") as client:
    r = client.get("/")
    assert r.status_code == 200
    assert r.text == "Hello World!"
import asyncio
import httpx


async def main():
    """Fetch https://www.example.com/ once with AsyncClient and print the response."""
    async with httpx.AsyncClient() as session:
        reply = await session.get('https://www.example.com/')
        print(reply)


# asyncio.run() creates the event loop, runs the coroutine, and closes the loop.
asyncio.run(main())
import httpx
import trio


# Same demo on the trio event loop (requires httpx's trio support).
async def main():
    async with httpx.AsyncClient() as client:
        response = await client.get('https://www.example.com/')
        print(response)


# trio.run takes the async function itself, not a coroutine object.
trio.run(main)
import httpx
import anyio


# Same demo via anyio, which lets you pick the backend ('trio' or 'asyncio').
async def main():
    async with httpx.AsyncClient() as client:
        response = await client.get('https://www.example.com/')
        print(response)


anyio.run(main, backend='trio')
python爬虫初学者使用requests+BeautifulSoup库的比较多;
再深入一些地使用scrapy进行数据爬取;
今天来比较一下requests、Httpx库结合BeautifulSoup,parsel库进行页面解析,采用同步、异步、多进程、多线程方式爬取数据,看看哪种方式更快!
以下内容参考【大江狗】博客,如有侵权,联系删除!
爬虫的入口url是https://sh.lianjia.com/ershoufang/pudong/a3p5/, 先发送请求获取最大页数,然后循环发送请求解析单个页面提取我们所要的信息(比如小区名,楼层,朝向,总价,单价等信息),最后导出csv文件。
import asyncio
import csv
import json
import re
import threading
import time
from multiprocessing import Manager, Pool, cpu_count

import httpx
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
def write_csv_file(file_path, data, data_type="list"):
    """Write scraped listing records to a CSV file (UTF-8 with BOM, for Excel).

    :param file_path: destination path of the CSV file.
    :param data: a list of dicts when ``data_type == "list"``, otherwise a
        queue of dicts (e.g. a multiprocessing Manager queue) drained with
        ``get()``. Each dict must contain every key in ``keys`` below.
    :param data_type: "list" to iterate ``data`` directly; any other value
        treats ``data`` as a queue.

    Failures are reported on stdout rather than raised — the export is
    deliberately best-effort, matching the original behavior.
    """
    head = ["标题", "小区", "房厅", "面积", "朝向", "楼层", "年份",
            "位置", "总价(万)", "单价(元/平方米)"]
    keys = ["title", "house", "bedroom", "area", "direction",
            "floor", "year", "location",
            "total_price", "unit_price"]
    try:
        # utf_8_sig writes a BOM so Excel detects the encoding correctly.
        with open(file_path, 'w', newline='', encoding='utf_8_sig') as csv_file:
            writer = csv.writer(csv_file, dialect='excel')
            writer.writerow(head)  # header row (the old `head is not None` check was dead code)
            if data_type == "list":
                for item in data:
                    writer.writerow([item[k] for k in keys])
            else:
                # Drain the queue until empty; skip falsy sentinel items.
                while not data.empty():
                    item = data.get()
                    if item:
                        writer.writerow([item[k] for k in keys])
        print("Write a CSV file to path %s Successful." % file_path)
    except Exception as e:
        print("Fail to write CSV to path: %s, Case: %s" % (file_path, e))
def resp_parse_parsel(response):
    """Parse one Lianjia listing page with parsel.

    :param response: an httpx/requests response whose ``.text`` is the
        second-hand-listing HTML page.
    :return: list of dicts with keys title/house/bedroom/area/direction/
        floor/year/location/total_price/unit_price (values are strings).
    """
    # Compile once, before the per-listing loop (the original recompiled all
    # three patterns on every iteration).
    floor_pattern = re.compile(r'\d{1,2}')
    year_pattern = re.compile(r'\d{4}')
    price_pattern = re.compile(r'\d+')
    datas = []
    selector = Selector(response.text)
    ul = selector.css('ul.sellListContent')[0]
    li_list = ul.css('li')
    for li in li_list:
        detail = dict()
        detail['title'] = li.css('div.title a::text').get()
        # e.g. "2室1厅 | 74.14平米 | 南 | 精装 | 高楼层(共6层) | 1999年建 | 板楼"
        house_info = li.css('div.houseInfo::text').get()
        house_info_list = house_info.split(" | ")
        detail['bedroom'] = house_info_list[0]
        detail['area'] = house_info_list[1]
        detail['direction'] = house_info_list[2]
        # Floor count hides inside "高楼层(共6层)"; search matches anywhere.
        match1 = floor_pattern.search(house_info_list[4])
        detail['floor'] = match1.group() if match1 else "未知"
        # Build year, e.g. "1999年建" -> "1999".
        match2 = year_pattern.search(house_info_list[5])
        detail['year'] = match2.group() if match2 else "未知"
        # e.g. "文兰小区 - 塘桥": first anchor is the estate, second the area.
        position_info = li.css('div.positionInfo a::text').getall()
        detail['house'] = position_info[0]
        detail['location'] = position_info[1]
        # "650万" -> "650"
        total_price = li.css('div.totalPrice span::text').get()
        detail['total_price'] = price_pattern.search(total_price).group()
        # "单价64182元/平米" -> "64182"
        unit_price = li.css('div.unitPrice span::text').get()
        detail['unit_price'] = price_pattern.search(unit_price).group()
        datas.append(detail)
    return datas
def parse_single_page1(url, q):
    """Multiprocessing-pool worker: fetch one listing page and push each
    parsed record onto the shared queue ``q``."""
    print("子进程开始爬取:{}".format(url))
    # No custom headers here: the worker runs in a child process without
    # access to the spider instance.
    response = httpx.get(url)
    for record in resp_parse_parsel(response):
        q.put(record)
class RequestSpider(object):
    """Synchronous spider using requests, parsing with BeautifulSoup or parsel."""

    def __init__(self):
        self.ua = UserAgent()                          # fake_useragent: random UA string
        self.headers = {"User-Agent": self.ua.random}
        self.data = []                                 # parsed listing dicts
        self.file_path = "home.csv"
        self.url = URL                                 # entry URL (module-level constant)

    def get_max_page(self, parse_type="bs4"):
        """Fetch the entry page and return the pager's totalPage as an int,
        or None when the HTTP request fails.

        :param parse_type: "bs4" parses with BeautifulSoup, anything else
            with parsel.
        """
        r = requests.get(self.url, headers=self.headers)
        if r.status_code != 200:
            print(f"请求失败: {r.status_code}")
            return None
        # The pager div carries page-data='{"totalPage":12,"curPage":1}'.
        if parse_type == "bs4":
            soup = BeautifulSoup(r.text, 'html.parser')
            d = soup.select('div[class="page-box house-lst-page-box"]')
            page_data = d[0].attrs["page-data"]
        else:
            selector = Selector(r.text)
            d = selector.css('div[class="page-box house-lst-page-box"]')
            # parsel selectors expose attributes via .attrib; the original
            # used bs4's .attrs here, which parsel objects don't have.
            page_data = d[0].attrib["page-data"]
        # The attribute value is JSON — parse it safely instead of eval()-ing
        # untrusted page content.
        return json.loads(page_data)["totalPage"]

    def parse_page_bs4(self):
        """Crawl every listing page and parse it with BeautifulSoup into self.data."""
        max_page = self.get_max_page()
        if not max_page:
            return  # entry request failed; nothing to crawl
        # Compile once instead of once per listing.
        floor_pattern = re.compile(r'\d{1,2}')
        year_pattern = re.compile(r'\d{4}')
        price_pattern = re.compile(r'\d+')
        for i in range(1, max_page + 1):
            url = URL_1.format(i)
            response = requests.get(url, headers=self.headers)
            soup = BeautifulSoup(response.text, 'html.parser')
            ul = soup.find_all("ul", class_="sellListContent")
            li_list = ul[0].select("li")
            for li in li_list:
                detail = dict()
                detail['title'] = li.select('div[class="title"]')[0].get_text()
                # e.g. "2室1厅 | 74.14平米 | 南 | 精装 | 高楼层(共6层) | 1999年建 | 板楼"
                house_info = li.select('div[class="houseInfo"]')[0].get_text()
                house_info_list = house_info.split(" | ")
                detail['bedroom'] = house_info_list[0]
                detail['area'] = house_info_list[1]
                detail['direction'] = house_info_list[2]
                match1 = floor_pattern.search(house_info_list[4])  # search anywhere
                detail['floor'] = match1.group() if match1 else "未知"
                match2 = year_pattern.search(house_info_list[5])   # build year
                detail['year'] = match2.group() if match2 else "未知"
                # e.g. "文兰小区 - 塘桥": estate name and area
                position_info = li.select('div[class="positionInfo"]')[0].get_text().split(' - ')
                detail['house'] = position_info[0]
                detail['location'] = position_info[1]
                # "650万" -> "650"
                total_price = li.select('div[class="totalPrice totalPrice2"]')[0].get_text()
                detail['total_price'] = price_pattern.search(total_price).group()
                # "单价64182元/平米" -> "64182"
                unit_price = li.select('div[class="unitPrice"]')[0].get_text()
                detail['unit_price'] = price_pattern.search(unit_price).group()
                self.data.append(detail)

    def parse_page_parsel(self):
        """Crawl every listing page and parse it with parsel into self.data."""
        max_page = self.get_max_page()
        if not max_page:
            return
        for i in range(1, max_page + 1):
            url = URL_1.format(i)
            response = requests.get(url, headers=self.headers)
            self.data.extend(resp_parse_parsel(response))
if __name__ == '__main__':
    # Demo 1: requests + BeautifulSoup, synchronous crawl, then CSV export.
    req = RequestSpider()
    req.parse_page_bs4()
    write_csv_file(req.file_path, req.data)
# NOTE(review): this duplicates RequestSpider.parse_page_parsel verbatim at
# module level — an artifact of the article pasting several script versions.
# It takes ``self`` and so cannot be called as a free function; kept as-is.
def parse_page_parsel(self):
    """Parse every listing page with parsel (requests backend)."""
    max_page = self.get_max_page()
    for i in range(1, max_page + 1):
        url = URL_1.format(i)
        response = requests.get(url, headers=self.headers)
        li = resp_parse_parsel(response)
        self.data.extend(li)
if __name__ == '__main__':
    # Demo 2: requests + parsel, synchronous crawl, then CSV export.
    req = RequestSpider()
    req.parse_page_parsel()
    write_csv_file(req.file_path, req.data)
class HttpxSpider(object):
    """Synchronous httpx spider (parsel parsing); base class for the
    async/multiprocess/multithread variants below."""

    def __init__(self):
        self.ua = UserAgent(use_cache_server=False)
        self.headers = {"User-Agent": self.ua.random}
        self.data = []                  # parsed listing dicts
        self.file_path = "home.csv"
        self.url = URL                  # entry URL (module-level constant)

    def get_max_page(self):
        """Fetch the entry page and return the pager's totalPage as an int,
        or None when the HTTP request fails."""
        print(self.headers)
        r = requests.get(self.url, headers=self.headers)
        if r.status_code != 200:
            print(f"请求失败: {r.status_code}")
            return None
        selector = Selector(r.text)
        d = selector.css('div[class="page-box house-lst-page-box"]')
        # page-data='{"totalPage":12,"curPage":1}' is JSON — parse it safely
        # instead of eval()-ing untrusted page content.
        return json.loads(d[0].xpath('//@page-data').get())["totalPage"]

    def parse_page(self):
        """Crawl every listing page sequentially with httpx and collect results."""
        max_page = self.get_max_page()
        if not max_page:
            return  # entry request failed; nothing to crawl
        for i in range(1, max_page + 1):
            url = URL_1.format(i)
            response = httpx.get(url, headers=self.headers)
            self.data.extend(resp_parse_parsel(response))
if __name__ == '__main__':
    # Demo 3: httpx synchronous + parsel.
    req1 = HttpxSpider()
    req1.parse_page()
    write_csv_file(req1.file_path, req1.data)
Httpx厉害的地方就是能发送异步请求。整个异步爬虫实现原理是,先发送同步请求获取最大页码,把每个单页的爬取和数据解析变为一个asyncio协程任务(使用async定义),最后使用事件循环执行。
class AsyncHttpxSpider(HttpxSpider):
    """Async variant: one coroutine per listing page, run concurrently."""

    async def parse_single_page(self, url):
        """Fetch and parse one listing page inside its own AsyncClient."""
        async with httpx.AsyncClient() as client:
            response = await client.get(url, headers=self.headers)
            # list.extend is effectively atomic under the GIL, and coroutines
            # on one loop never run simultaneously anyway.
            self.data.extend(resp_parse_parsel(response))

    def parse_page(self):
        """Crawl all pages concurrently.

        The original used loop.run_until_complete(asyncio.wait(coros)):
        passing bare coroutines to asyncio.wait was deprecated in 3.8 and
        removed in 3.11, and asyncio.get_event_loop() is deprecated outside
        a running loop. asyncio.run + asyncio.gather is the supported
        equivalent with the same effect.
        """
        max_page = self.get_max_page()
        if not max_page:
            return  # entry request failed; nothing to crawl

        async def crawl_all():
            await asyncio.gather(
                *(self.parse_single_page(URL_1.format(i))
                  for i in range(1, max_page + 1)))

        asyncio.run(crawl_all())
if __name__ == "__main__":
    # Demo 4: httpx async + parsel.
    req2 = AsyncHttpxSpider()
    req2.parse_page()
    write_csv_file(req2.file_path, req2.data)
class MultiProcessHttpxSpider(HttpxSpider):
    """Multiprocessing variant of HttpxSpider."""

    def __init__(self):
        super().__init__()
        # Processes don't share memory, so parsed records are exchanged
        # through a Manager().Queue() shared across workers.
        self.q = Manager().Queue()
if __name__ == "__main__":
    # Demo 5: httpx + multiprocessing pool. On Windows, process spawning
    # must happen under the __main__ guard.
    req3 = MultiProcessHttpxSpider()
    max_page = req3.get_max_page()
    urls = [URL_1.format(i) for i in range(1, max_page + 1)]
    pool = Pool(processes=cpu_count())
    ll = []
    for url in urls:
        # apply_async returns AsyncResult handles; actual records travel
        # back through the shared Manager queue req3.q.
        ll.append(pool.apply_async(parse_single_page1, args=(url, req3.q)))
    pool.close()
    pool.join()
    write_csv_file(req3.file_path, req3.q, data_type="queue")
class ThreadHttpxSpider(HttpxSpider):
    """Threaded variant: one worker thread per listing page."""

    def parse_single_page(self, url):
        """Fetch and parse a single listing page (runs inside a worker thread)."""
        print("多线程开始爬取:{}".format(url))
        resp = httpx.get(url, headers=self.headers)
        self.data.extend(resp_parse_parsel(resp))

    def parse_page(self):
        """Spawn one thread per page, start them all, then wait for completion."""
        max_page = self.get_max_page()
        workers = [
            threading.Thread(target=self.parse_single_page,
                             args=(URL_1.format(page),))
            for page in range(1, max_page + 1)
        ]
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()
if __name__ == "__main__":
    # Demo 6: httpx + multithreading.
    req4 = ThreadHttpxSpider()
    req4.parse_page()
    write_csv_file(req4.file_path, req4.data)
爬取同样的内容,采用不同工具组合耗时是不一样的。httpx异步爬虫和多线程爬虫毫无疑问是最大的赢家。多进程, 多线程和异步协程均可以提高Python爬虫的工作效率。对于爬虫这种非计算密集型的工作,多进程编程对效率的提升不如多线程和异步协程。异步爬虫不总是最快的,同步爬虫+多线程也同样可以很快,有时甚至更快。
留言与评论(共有 0 条评论) “” |