In Python crawler development we often run into programs that suddenly pause or appear to "hang". This not only hurts the crawler's efficiency, it can also leave the collected data incomplete. This article looks at the common causes of these pauses and walks through the corresponding fixes.
A first common cause is network instability: a flaky connection or a transient server error can leave a request hanging or failing outright. requests does not retry on its own, but you can mount an HTTPAdapter configured with urllib3's Retry onto a Session to get automatic retries with backoff.

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def get_html(url):
    session = requests.Session()
    # Retry up to 5 times on common transient server errors, backing off between attempts
    retries = Retry(total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
    session.mount('http://', HTTPAdapter(max_retries=retries))
    session.mount('https://', HTTPAdapter(max_retries=retries))
    try:
        # timeout keeps a single request from blocking forever
        response = session.get(url, timeout=10)
        response.raise_for_status()
        return response.text
    except requests.exceptions.HTTPError as e:
        print(f"HTTPError: {e}")
    except requests.exceptions.ConnectionError as e:
        print(f"ConnectionError: {e}")
    except requests.exceptions.Timeout as e:
        print(f"Timeout: {e}")
    except requests.exceptions.RequestException as e:
        print(f"RequestException: {e}")

# Example usage
url = "http://example.com"
html = get_html(url)
print(html)
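If you prefer not to configure urllib3's Retry, the same idea can be expressed as a plain retry loop. The sketch below is illustrative rather than part of the original example; the function name, attempt count, and delay values are assumptions.

import time
import requests

def get_html_with_backoff(url, max_attempts=5, base_delay=1):
    # Try up to max_attempts times, sleeping 1s, 2s, 4s, ... between failures (values are illustrative)
    for attempt in range(max_attempts):
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            return response.text
        except requests.exceptions.RequestException as e:
            print(f"Attempt {attempt + 1} failed: {e}")
            if attempt < max_attempts - 1:
                time.sleep(base_delay * (2 ** attempt))
    return None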
Another frequent culprit is the target site's anti-crawling measures: requests that always carry the library's default python-requests User-Agent are easy to identify and throttle or block. Sending a randomized, browser-like User-Agent header on each request helps avoid this.

import requests
from fake_useragent import UserAgent
def get_html(url):
    ua = UserAgent()
    # Pick a random browser-like User-Agent for every request
    headers = {'User-Agent': ua.random}
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        return response.text
    except requests.exceptions.HTTPError as e:
        print(f"HTTPError: {e}")
    except requests.exceptions.ConnectionError as e:
        print(f"ConnectionError: {e}")
    except requests.exceptions.Timeout as e:
        print(f"Timeout: {e}")
    except requests.exceptions.RequestException as e:
        print(f"RequestException: {e}")
# Example usage
url = "http://example.com"
html = get_html(url)
print(html)
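fake_useragent is an extra dependency; if you would rather avoid it, rotating through a small hand-maintained pool of User-Agent strings with random.choice gives a similar effect. The sketch below is only an illustration: the function name and the specific UA strings are assumptions, not an exhaustive or current list.

import random
import requests

# A small, hand-maintained pool of User-Agent strings (examples only)
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0",
]

def get_html_fixed_ua(url):
    # Choose one UA string at random per request
    headers = {"User-Agent": random.choice(USER_AGENTS)}
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"RequestException: {e}")
        return None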
Pauses can also come from the parsing stage: an unexpected page layout makes the parser raise or return nothing, and the crawler looks stuck. Parsing with BeautifulSoup on top of the lxml parser copes well with messy markup.

from bs4 import BeautifulSoup
def parse_html(html):
    # lxml is fast and tolerant of malformed markup
    soup = BeautifulSoup(html, 'lxml')
    title = soup.find('title').text
    print(title)
# Example usage
html = """
<html>
<head><title>Example</title></head>
<body><h1>Hello, World!</h1></body>
</html>
"""
parse_html(html)
When many pages need to be fetched, a single-threaded crawler spends most of its time waiting on I/O and can look frozen. A thread pool lets several requests run concurrently, so one slow page does not stall the whole run.

import requests
from concurrent.futures import ThreadPoolExecutor
def fetch(url):
    try:
        # timeout keeps one slow page from blocking its worker thread
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return response.text
    except requests.exceptions.HTTPError as e:
        print(f"HTTPError: {e}")
    except requests.exceptions.ConnectionError as e:
        print(f"ConnectionError: {e}")
    except requests.exceptions.Timeout as e:
        print(f"Timeout: {e}")
    except requests.exceptions.RequestException as e:
        print(f"RequestException: {e}")
def fetch_all(urls):
    # Run up to 5 downloads at a time; map() yields results in input order
    with ThreadPoolExecutor(max_workers=5) as executor:
        results = executor.map(fetch, urls)
        for result in results:
            print(result)
# Example usage
urls = ["http://example.com"] * 10
fetch_all(urls)
Finally, wrap the crawler's entry point in a broad try/except so that an unexpected exception is at least reported instead of silently killing or freezing the program.

# Example code
def main():
    try:
        # crawler logic goes here
        pass
    except Exception as e:
        print(f"Error: {e}")

if __name__ == "__main__":
    main()
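print() only writes the error message to the console, and it is gone once the process exits. One possible refinement, not part of the original example, is to route errors through the standard logging module so the full traceback ends up in a file (the file name and format below are assumptions):

import logging

# Send errors to a log file so a crash leaves a trace (the file name is illustrative)
logging.basicConfig(
    filename='crawler.log',
    level=logging.INFO,
    format='%(asctime)s %(levelname)s %(message)s',
)

def main():
    try:
        pass  # crawler logic goes here
    except Exception:
        # logging.exception() records the full traceback, not just the message
        logging.exception("Crawler stopped unexpectedly")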
if __name__ == "__main__": main()Python爬虫暂停的原因多种多样,解决这些问题需要我们具备一定的网络知识、编程技巧和问题排查能力。通过以上方法,我们可以有效地解决Python爬虫暂停问题,提高爬虫的稳定性和效率。