Python Web Scraping
requests
## Installing requests

```bash
pip install requests
sudo pip install requests   # if installation fails with "Permission denied", retry with sudo
```

## Basic usage

```python
import requests

response = requests.get('https://www.douban.com/')   # Douban homepage

# Pass query-string parameters
response = requests.get('https://www.douban.com/search',
                        params={'q': 'python', 'cat': '1001'})

# Pass HTTP headers
response = requests.get('https://www.douban.com/',
                        headers={'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit',
                                 'Accept-Language': 'en-US'})

# Pass cookies
response = requests.get(url, cookies={'token': '12345', 'status': 'working'})

# Control redirects
response = requests.get(url, allow_redirects=False)
```

## Timeouts

```python
response = requests.get(url, timeout=(3.05, 27))   # connect timeout 3.05 s, read timeout 27 s
```

## Streaming requests

For large responses, receive the body chunk by chunk instead of loading it all at once:

```python
response = requests.get(url, stream=True)
for chunk in response.iter_content(chunk_size=8192):
    process(chunk)
```

## Sending data

```python
# JSON body (Content-Type: application/json)
response = requests.post(url, json={"title": "Hello", "body": "World"})

# Form data (Content-Type: application/x-www-form-urlencoded)
response = requests.post('https://www.douban.com/search',
                         data={'q': 'python', 'cat': '1001'})
```

## Setting the encoding manually (if the text comes back garbled)

```python
response.encoding = "gbk"   # e.g. for Chinese pages served as GBK
```

## Exception handling (HTTP errors)

```python
try:
    response.raise_for_status()   # raises HTTPError for 4xx/5xx status codes
except requests.HTTPError as e:
    print(f"Request failed: {e}")
```

## File upload

```python
# Single file
with open("report.pdf", "rb") as f:
    files = {"document": f}
    response = requests.post(url, files=files)

# Multiple files / mixed data
import json

files = {
    "image": ("cat.jpg", open("cat.jpg", "rb"), "image/jpeg"),
    "metadata": ("data.json", json.dumps({"tag": "animal"}), "application/json"),
}
response = requests.post(url, files=files)
```

## Session management

A `Session` persists configuration (headers, auth, cookies) and reuses the underlying connection pool:

```python
with requests.Session() as session:
    session.headers.update({"User-Agent": "MyApp/1.0"})
    session.auth = ("user", "pass")
    # Log in once; the session stores the returned cookies
    login_resp = session.post(login_url, data=credentials)
    # Subsequent requests send the cookies automatically
    profile_resp = session.get(profile_url)
```

## SSL/TLS

```python
response = requests.get(url, verify=False)                                     # disable verification (not recommended)
response = requests.get(url, verify="/path/to/ca-bundle.crt")                  # custom CA bundle
response = requests.get(url, cert=("/path/client.cert", "/path/client.key"))   # client certificate authentication
```

## Authentication

```python
# Basic Auth
from requests.auth import HTTPBasicAuth
response = requests.get(url, auth=HTTPBasicAuth("user", "pass"))

# Digest Auth
from requests.auth import HTTPDigestAuth
response = requests.get(url, auth=HTTPDigestAuth("user", "pass"))

# OAuth 1.0 (requires the requests-oauthlib package)
from requests_oauthlib import OAuth1
auth = OAuth1("client_key", "client_secret", "token", "token_secret")
response = requests.get(url, auth=auth)
```

## Proxies

HTTP(S) and SOCKS proxies are supported (SOCKS support comes from `pip install requests[socks]`):

```python
proxies = {
    "http": "http://10.10.1.10:3128",
    "https": "socks5://user:pass@host:port",
}
response = requests.get(url, proxies=proxies)
```

## Error handling (all request exceptions)

```python
import logging

try:
    resp = requests.get(url, timeout=5)
    resp.raise_for_status()
except requests.RequestException as e:
    logging.error(f"Request failed: {e}")
```
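Putting several of the pieces above together, here is a minimal sketch of a fetch loop that combines a shared `Session`, default headers, per-request timeouts, and error handling. The `START_URLS` list, the `fetch_all` helper, and the `MyApp/1.0` user agent are illustrative placeholders, not part of the original notes.

```python
import logging
import requests

logging.basicConfig(level=logging.INFO)

# Hypothetical targets; replace with the pages you actually want to fetch.
START_URLS = [
    "https://www.douban.com/",
    "https://www.douban.com/search?q=python",
]

def fetch_all(urls):
    """Fetch each URL with one shared Session and return the pages that succeeded."""
    pages = {}
    with requests.Session() as session:
        session.headers.update({"User-Agent": "MyApp/1.0"})
        for url in urls:
            try:
                resp = session.get(url, timeout=(3.05, 27))
                resp.raise_for_status()          # turn 4xx/5xx into HTTPError
            except requests.RequestException as e:
                logging.error("Request failed for %s: %s", url, e)
                continue
            pages[url] = resp.text
    return pages

if __name__ == "__main__":
    for url, html in fetch_all(START_URLS).items():
        print(url, len(html))
```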
## Response attributes

| Attribute / method | Description | Example |
|---|---|---|
| `response.status_code` | HTTP status code (e.g. 200, 404) | `if response.status_code == 200:` |
| `response.text` | body decoded to text (encoding detected automatically) | `print(response.text[:100])` |
| `response.content` | raw body as bytes | `with open("image.png", "wb") as f: f.write(response.content)` |
| `response.json()` | parses the JSON body into a dict | `data = response.json()` |
| `response.headers` | response headers (dict-like) | `content_type = response.headers["Content-Type"]` |
| `response.cookies` | cookies set by the server | `response.cookies.get("session_id")`, `response.cookies['ts']` |
| `response.history` | list of `Response` objects from the redirect history | `for resp in response.history: print(resp.url)` |
| `response.url` | final URL that was actually requested | |
| `response.encoding` | encoding used to decode `response.text` (can be set manually) | |
| `response.apparent_encoding` | encoding guessed from the response body | |
| `response.elapsed` | `timedelta` from sending the request until the response arrived; useful for measuring response time | `response.elapsed.microseconds` |
| `response.is_redirect` | `True` if this response is a redirect, otherwise `False` | |
| `response.is_permanent_redirect` | `True` if this response is a permanent redirect, otherwise `False` | |
| `response.next` | `PreparedRequest` for the next request in a redirect chain | |
| `response.ok` | `True` if `status_code` is less than 400, otherwise `False` | |
| `response.reason` | textual reason of the status, e.g. `"OK"` or `"Not Found"` | |
| `response.request` | the `PreparedRequest` object that produced this response | |
| `response.links` | parsed `Link` response headers | |
| `response.iter_lines()` | iterates over the response body line by line | |
| `response.iter_content()` | iterates over the response body in chunks | |
| `response.raise_for_status()` | raises an `HTTPError` if the request returned an error status | |
| `response.close()` | closes the connection to the server | |

## requests module methods

| Method | Description |
|---|---|
| `requests.delete(url, args)` | sends a DELETE request to the given url |
| `requests.get(url, params, args)` | sends a GET request to the given url |
| `requests.head(url, args)` | sends a HEAD request to the given url |
| `requests.patch(url, data, args)` | sends a PATCH request to the given url |
| `requests.post(url, data, json, args)` | sends a POST request to the given url |
| `requests.put(url, data, args)` | sends a PUT request to the given url |
| `requests.request(method, url, args)` | sends a request with the specified method to the given url |

Here `args` stands for the remaining optional keyword arguments (`headers`, `cookies`, `timeout`, `proxies`, and so on).
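As a quick illustration of the attributes above, the sketch below fetches a page that issues one redirect and prints a few response fields. The httpbin.org endpoint is used here only as a convenient public test service; it is an assumption of this example, not part of the original notes.

```python
import requests

# /redirect/1 on httpbin.org performs one redirect and then returns a JSON body.
response = requests.get("https://httpbin.org/redirect/1", timeout=10)

print(response.status_code)                    # e.g. 200
print(response.ok)                             # True for status codes < 400
print(response.url)                            # final URL after the redirect
print([r.url for r in response.history])       # the intermediate redirect response(s)
print(response.headers.get("Content-Type"))    # e.g. application/json
print(response.elapsed.total_seconds())        # time from request to response, in seconds
print(response.json())                         # parsed JSON body
```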