Python Web Scraping: Using urllib, from Basics to Advanced Techniques


Published: 2024-10-10 23:24:28

This guide walks through using urllib for web scraping in Python, from the basics up to more advanced techniques. urllib is the standard-library package for working with URLs; it is made up of several submodules, which we will explore step by step.
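
Before diving in, here is a quick orientation sketch of urllib's main submodules (robotparser is only used for the robots.txt note at the end of this guide):

python

from urllib import request      # opening URLs and sending requests
from urllib import parse        # encoding, decoding and building URLs
from urllib import error        # exceptions raised while opening URLs
from urllib import robotparser  # parsing robots.txt files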

1. Basic Usage of urllib

1.1 Importing the Required Modules

python
from urllib import request, parse, error

1.2 Sending a GET Request

python

url = "https://www.example.com"
response = request.urlopen(url)
html = response.read().decode('utf-8')
print(html)

1.3 Sending a POST Request

python

data = parse.urlencode({'key1': 'value1', 'key2': 'value2'}).encode('utf-8')
req = request.Request(url, data=data, method='POST')
response = request.urlopen(req)
html = response.read().decode('utf-8')
print(html)
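
If an endpoint expects a JSON body rather than form data, the same Request class can be used with an explicit Content-Type header. A minimal sketch, assuming a hypothetical JSON endpoint and payload:

python

import json

api_url = "https://www.example.com/api"  # hypothetical endpoint
payload = {'key1': 'value1', 'key2': 'value2'}
data = json.dumps(payload).encode('utf-8')

req = request.Request(
    api_url,
    data=data,
    headers={'Content-Type': 'application/json'},
    method='POST'
)
response = request.urlopen(req)
print(response.status)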

2. Working with HTTP Headers

2.1 Adding Custom Headers

python

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
req = request.Request(url, headers=headers)
response = request.urlopen(req)

2.2 Reading Response Headers

python

print(response.getheaders())
print(response.getheader('Content-Type'))
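
Besides the full header list, the response object also exposes the status code and the URL that was actually retrieved (useful after redirects); a quick sketch:

python

print(response.status)  # e.g. 200
print(response.url)     # the URL that was actually fetched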

3. Working with URLs

3.1 URL Encoding

python

encoded_url = parse.quote('https://example.com/path with spaces')
print(encoded_url)

3.2 URL Decoding

python

decoded_url = parse.unquote('https://example.com/path%20with%20spaces')
print(decoded_url)

3.3 Handling URL Parameters

python

params = {'key1': 'value1', 'key2': 'value2'}
query_string = parse.urlencode(params)
url = f"https://example.com/search?{query_string}"
print(url)
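
urllib.parse can also go the other way and take an existing URL apart; a short sketch using urlparse and parse_qs (the URL is made up for illustration):

python

from urllib.parse import urlparse, parse_qs

parsed = urlparse('https://example.com/search?key1=value1&key2=value2')
print(parsed.scheme)           # https
print(parsed.netloc)           # example.com
print(parsed.path)             # /search
print(parse_qs(parsed.query))  # {'key1': ['value1'], 'key2': ['value2']}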

4. Error Handling
python

try:
    response = request.urlopen("https://www.example.com/nonexistent")
except error.HTTPError as e:
    # HTTPError is a subclass of URLError, so it must be caught first
    print(f"HTTP Error: {e.code}")
except error.URLError as e:
    print(f"URL Error: {e.reason}")

5. Using a Proxy
python

proxy_handler = request.ProxyHandler({'http': 'http://127.0.0.1:8080'})
opener = request.build_opener(proxy_handler)
request.install_opener(opener)
response = request.urlopen(url)

6. Handling Cookies
python

import http.cookiejar

cookie_jar = http.cookiejar.CookieJar()
opener = request.build_opener(request.HTTPCookieProcessor(cookie_jar))
request.install_opener(opener)
response = request.urlopen(url)

for cookie in cookie_jar:
    print(f"{cookie.name}: {cookie.value}")

7. Advanced: Custom Request Handlers
python

class CustomHTTPHandler(request.HTTPHandler):
    def http_request(self, req):
        # Add a custom header to every outgoing request
        req.add_header('Custom-Header', 'CustomValue')
        return super().http_request(req)

opener = request.build_opener(CustomHTTPHandler)
request.install_opener(opener)
response = request.urlopen(url)

8. Advanced: Handling Redirects
python

class NoRedirectHandler(request.HTTPRedirectHandler):
    def http_error_302(self, req, fp, code, msg, headers):
        # Do not follow the redirect; raise so the caller can inspect it
        raise error.HTTPError(req.full_url, code, msg, headers, fp)

opener = request.build_opener(NoRedirectHandler)
request.install_opener(opener)
try:
    response = request.urlopen(url)
except error.HTTPError as e:
    if e.code == 302:
        print(f"Redirected to: {e.headers['Location']}")

9. Advanced: Asynchronous Requests (with asyncio)

urllib itself is blocking, so this example uses the third-party aiohttp library together with asyncio to fetch several pages concurrently:
python

import asyncio
import aiohttp

async def fetch(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            return await response.text()

async def main():
    urls = ['https://example.com', 'https://example.org', 'https://example.net']
    tasks = [asyncio.create_task(fetch(url)) for url in urls]
    results = await asyncio.gather(*tasks)
    for url, html in zip(urls, results):
        print(f"Content length from {url}: {len(html)}")

asyncio.run(main())

10. Practical Example: Scraping a Page and Parsing Its Content
python

from urllib import request, error
from bs4 import BeautifulSoup

def scrape_website(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    req = request.Request(url, headers=headers)

    try:
        response = request.urlopen(req)
        html = response.read().decode('utf-8')

        soup = BeautifulSoup(html, 'html.parser')

        # Extract the page title
        title = soup.title.string if soup.title else 'No title found'

        # Extract the text of all paragraphs
        paragraphs = [p.text for p in soup.find_all('p')]

        # Extract all links
        links = [a['href'] for a in soup.find_all('a', href=True)]

        return {
            'title': title,
            'paragraphs': paragraphs,
            'links': links
        }
    except error.HTTPError as e:
        print(f"HTTP Error: {e.code}")
    except error.URLError as e:
        print(f"URL Error: {e.reason}")

    return None

# Usage example
result = scrape_website('https://example.com')
if result:
    print(f"Title: {result['title']}")
    print(f"Number of paragraphs: {len(result['paragraphs'])}")
    print(f"Number of links: {len(result['links'])}")

This guide has covered urllib from basic usage to advanced techniques, including sending different kinds of requests, URL manipulation, error handling, proxies and cookies, as well as more advanced topics such as custom handlers and asynchronous requests. The closing practical example shows how to combine the scraper with BeautifulSoup to parse the fetched content.

In practice, you will likely need to adapt this code to your specific scraping target and the structure of the site. Also, always follow each site's robots.txt rules and terms of service so that your scraping stays legal and ethical.
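
For the robots.txt check mentioned above, urllib ships its own parser in urllib.robotparser; a minimal sketch (the page URL is a placeholder, and scrape_website is the function from the previous example):

python

from urllib import robotparser

rp = robotparser.RobotFileParser()
rp.set_url('https://example.com/robots.txt')
rp.read()

# Only fetch the page if this user agent is allowed to
if rp.can_fetch('*', 'https://example.com/somepage'):
    result = scrape_website('https://example.com/somepage')
else:
    print('Fetching this URL is disallowed by robots.txt')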
