News Center and Beginner Tutorials
Published: 2024-10-10 23:24:28
This article walks through how to use urllib for web scraping in Python, from the basics to more advanced techniques. urllib is the Python standard library package for working with URLs; it is made up of several submodules, which we will explore step by step.
1.1 Importing the required modules
from urllib import request, parse, error
1.2 Sending a GET request
url = "https://www.example.com"
response = request.urlopen(url)
html = response.read().decode('utf-8')
print(html)
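urlopen also accepts a timeout argument, and the response can be used as a context manager so the connection is closed automatically. A minimal sketch building on the request above (the 10-second timeout is an arbitrary choice):

with request.urlopen(url, timeout=10) as response:
    html = response.read().decode('utf-8')
    print(response.status, len(html))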
1.3 Sending a POST request
data = parse.urlencode({'key1': 'value1', 'key2': 'value2'}).encode('utf-8')
req = request.Request(url, data=data, method='POST')
response = request.urlopen(req)
html = response.read().decode('utf-8')
print(html)
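If the endpoint expects a JSON body instead of form-encoded data, the same Request object can carry serialized JSON together with an explicit Content-Type header. A minimal sketch, assuming the example URL accepts JSON:

import json

payload = json.dumps({'key1': 'value1', 'key2': 'value2'}).encode('utf-8')
req = request.Request(
    url,
    data=payload,
    headers={'Content-Type': 'application/json'},  # tell the server the body is JSON
    method='POST'
)
response = request.urlopen(req)
print(response.status)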
2.1 Adding custom headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
req = request.Request(url, headers=headers)
response = request.urlopen(req)
2.2 Reading response headers
print(response.getheaders())
print(response.getheader('Content-Type'))
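Besides individual headers, the response object also exposes the status code and the final URL, which is useful when redirects were followed:

print(response.status)              # numeric status code, e.g. 200
print(response.url)                 # final URL after any redirects
print(dict(response.getheaders()))  # all headers as a dictionary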
3.1 URL encoding
encoded_url = parse.quote('https://example.com/path with spaces')
print(encoded_url)
3.2 URL decoding
decoded_url = parse.unquote('https://example.com/path%20with%20spaces')
print(decoded_url)
3.3 Handling URL parameters
params = {'key1': 'value1', 'key2': 'value2'}
query_string = parse.urlencode(params)
url = f"https://example.com/search?{query_string}"
print(url)
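Going in the other direction, parse.urlparse splits a URL into its components and parse.parse_qs turns a query string back into a dictionary (values come back as lists). Applied to the URL built above:

parsed = parse.urlparse(url)
print(parsed.netloc)                 # 'example.com'
print(parsed.path)                   # '/search'
print(parse.parse_qs(parsed.query))  # {'key1': ['value1'], 'key2': ['value2']}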
4. Error Handling

try:
    response = request.urlopen("https://www.example.com/nonexistent")
except error.HTTPError as e:
    print(f"HTTP error: {e.code}")
except error.URLError as e:
    print(f"URL error: {e.reason}")
5. Using a Proxy

proxy_handler = request.ProxyHandler({'http': 'http://127.0.0.1:8080'})
opener = request.build_opener(proxy_handler)
request.install_opener(opener)
response = request.urlopen(url)
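install_opener changes the global default used by urlopen. If the proxy should only apply to some requests, the opener can also be used directly; a sketch assuming the same local proxy:

proxy_handler = request.ProxyHandler({'http': 'http://127.0.0.1:8080',
                                      'https': 'http://127.0.0.1:8080'})
opener = request.build_opener(proxy_handler)
response = opener.open(url)  # only this call goes through the proxy
print(response.status)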
6. Handling Cookies

import http.cookiejar

cookie_jar = http.cookiejar.CookieJar()
opener = request.build_opener(request.HTTPCookieProcessor(cookie_jar))
request.install_opener(opener)
response = request.urlopen(url)
for cookie in cookie_jar:
    print(f"{cookie.name}: {cookie.value}")
7. Custom Handlers

class CustomHTTPHandler(request.HTTPHandler):
    def http_request(self, req):
        req.add_header('Custom-Header', 'CustomValue')
        return super().http_request(req)

opener = request.build_opener(CustomHTTPHandler)
request.install_opener(opener)
response = request.urlopen(url)
Another handler worth overriding is HTTPRedirectHandler, for example to stop redirects from being followed automatically:

class NoRedirectHandler(request.HTTPRedirectHandler):
    def http_error_302(self, req, fp, code, msg, headers):
        # Return the 302 response itself instead of following the redirect
        return fp

opener = request.build_opener(NoRedirectHandler)
request.install_opener(opener)
response = request.urlopen(url)
if response.status == 302:
    print(f"Redirect to: {response.headers['Location']}")
8. Asynchronous Requests

import asyncio
import aiohttp

async def fetch(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            return await response.text()

async def main():
    urls = ['https://example.com', 'https://example.org', 'https://example.net']
    tasks = [asyncio.create_task(fetch(url)) for url in urls]
    results = await asyncio.gather(*tasks)
    for url, html in zip(urls, results):
        print(f"Content length from {url}: {len(html)}")

asyncio.run(main())
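aiohttp is a third-party package. If you prefer to stay within the standard library, the blocking urlopen calls can be pushed onto worker threads with asyncio.to_thread (Python 3.9+). A rough standard-library-only sketch of the same idea; the function names here are illustrative:

from urllib import request

async def fetch_stdlib(url):
    def blocking_get():
        # Ordinary blocking urllib call, executed in a worker thread
        with request.urlopen(url, timeout=10) as response:
            return response.read().decode('utf-8')
    return await asyncio.to_thread(blocking_get)

async def main_stdlib():
    urls = ['https://example.com', 'https://example.org', 'https://example.net']
    results = await asyncio.gather(*(fetch_stdlib(u) for u in urls))
    for url, html in zip(urls, results):
        print(f"Content length from {url}: {len(html)}")

asyncio.run(main_stdlib())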
9. Practical Example: urllib with BeautifulSoup

from urllib import request, error
from bs4 import BeautifulSoup

def scrape_website(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    req = request.Request(url, headers=headers)
    try:
        response = request.urlopen(req)
        html = response.read().decode('utf-8')
        soup = BeautifulSoup(html, 'html.parser')
        # Extract the page title
        title = soup.title.string if soup.title else 'No title found'
        # Extract the text of all paragraphs
        paragraphs = [p.text for p in soup.find_all('p')]
        # Extract all links
        links = [a['href'] for a in soup.find_all('a', href=True)]
        return {
            'title': title,
            'paragraphs': paragraphs,
            'links': links
        }
    except error.HTTPError as e:
        print(f"HTTP error: {e.code}")
    except error.URLError as e:
        print(f"URL error: {e.reason}")
    return None

# Usage example
result = scrape_website('https://example.com')
if result:
    print(f"Title: {result['title']}")
    print(f"Number of paragraphs: {len(result['paragraphs'])}")
    print(f"Number of links: {len(result['links'])}")
This guide has covered urllib from basic usage through more advanced techniques, including sending different types of requests, URL manipulation, error handling, proxies, cookie handling, and advanced features such as custom handlers and asynchronous requests. The final hands-on example showed how to combine urllib with BeautifulSoup to parse the content you fetch.
In real projects you will likely need to adapt this code to your specific scraping targets and the structure of each site. Also, always follow each site's robots.txt rules and terms of use so that your scraping remains legal and ethical.
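For the robots.txt check mentioned above, the standard library ships urllib.robotparser; a minimal sketch against the example domain:

from urllib import robotparser

rp = robotparser.RobotFileParser()
rp.set_url('https://example.com/robots.txt')
rp.read()  # download and parse the robots.txt file
if rp.can_fetch('*', 'https://example.com/some/page'):
    print("Allowed to fetch")
else:
    print("Disallowed by robots.txt")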