Python爬取小说技术指南（python爬取文章）

off999 2025-05-11 00:13 36 浏览 0 评论

在 Python 中爬取小说需要遵循法律法规和网站的服务条款，请确保你有权获取目标内容。以下是使用 Python 爬取小说的通用技术流程。

安装依赖库

pip install requests beautifulsoup4 -i https://pypi.tuna.tsinghua.edu.cn/simple

示例代码

import requests
from bs4 import BeautifulSoup
import time

# URL of the novel's table-of-contents page that the script starts from
url = "http://www.bequ6.org/144_144551/"

# Request headers sent with every HTTP call; the User-Agent is a desktop
# Chrome string so the requests look like they come from a browser
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    ),
}

# Fetch the table-of-contents page and collect the chapter URLs
def get_chapter_links(url, timeout=10):
    """Return the absolute URLs of the chapters linked from a contents page.

    Args:
        url: Address of the table-of-contents page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of absolute chapter URLs in document order. An empty list is
        returned (and the error printed) if the page cannot be fetched or
        parsed; this function never raises.
    """
    # Function-scope import so the block does not depend on file-level imports.
    from urllib.parse import urljoin

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        # Fail on 4xx/5xx instead of parsing an error page.
        response.raise_for_status()
        response.encoding = "utf-8"  # adjust to the page's actual encoding
        soup = BeautifulSoup(response.text, "html.parser")

        # Chapter links are assumed to be the <a> tags inside <div id="list">
        # (adjust to the real HTML structure of the target site).
        chapter_list = soup.find("div", id="list")
        if chapter_list is None:
            raise ValueError('未找到章节列表 <div id="list">')

        chapter_urls = []
        for anchor in chapter_list.find_all("a"):
            href = anchor.get("href")
            if not href:
                # Anchors without a target would otherwise become ".../None".
                continue
            # urljoin resolves relative, root-relative and absolute hrefs
            # against the page URL, unlike manual string concatenation.
            chapter_urls.append(urljoin(url, href))
        return chapter_urls
    except Exception as e:
        # Deliberate best-effort boundary: report the problem and let the
        # caller continue with an empty result.
        print(f"获取章节链接失败: {e}")
        return []

# Fetch a single chapter page and extract its title and body text
def get_chapter_content(chapter_url, timeout=10):
    """Download one chapter and return its title and text.

    Args:
        chapter_url: Absolute URL of the chapter page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(title, content)`` tuple of stripped strings, or ``(None, None)``
        if the page cannot be fetched or the expected elements are missing
        (the error is printed; this function never raises).
    """
    try:
        response = requests.get(chapter_url, headers=headers, timeout=timeout)
        # Fail on 4xx/5xx instead of parsing an error page.
        response.raise_for_status()
        response.encoding = "utf-8"
        soup = BeautifulSoup(response.text, "html.parser")

        # The title is assumed to be in the first <h1> tag.
        title_tag = soup.find("h1")
        if title_tag is None:
            raise ValueError("未找到标题 <h1>")

        # The chapter body is assumed to be in <div id="content">.
        content_tag = soup.find("div", id="content")
        if content_tag is None:
            raise ValueError('未找到正文 <div id="content">')

        return title_tag.text.strip(), content_tag.text.strip()
    except Exception as e:
        # Deliberate best-effort boundary: one bad chapter must not abort
        # the whole download.
        print(f"获取章节内容失败: {chapter_url} - {e}")
        return None, None

# Script entry point: download every chapter and append it to one text file
if __name__ == "__main__":
    # Collect all chapter links from the table-of-contents page.
    chapter_links = get_chapter_links(url)
    print(f"共找到 {len(chapter_links)} 个章节")

    if chapter_links:
        saved_count = 0
        # Opened only when there is something to download, so a failed run
        # does not truncate a previously saved file.
        with open("novel_content.txt", "w", encoding="utf-8") as f:
            for link in chapter_links:
                title, content = get_chapter_content(link)
                if title and content:
                    f.write(f"\n\n{title}\n\n")
                    f.write(content)
                    saved_count += 1
                    print(f"已保存章节: {title}")
                time.sleep(1)  # throttle requests to avoid being blocked

        if saved_count:
            print("小说内容已保存到 novel_content.txt")
        else:
            print("所有章节均获取失败，未保存任何内容")
    else:
        print("未找到任何章节，未写入文件")