C#网络爬虫抓取小说

程序员技术 · 公众号 · 程序员 · 2017-09-09 15:40

正文

点击上方“程序员共读”，选择“置顶公众号”

关键时刻，第一时间送达！

阅读目录

1、分析html规则

2、C#完整代码

3、最后效果

心血来潮，想研究下爬虫，爬点小说。

通过百度选择了一个小说网站，随便找了一本小说：http://www.23us.so/files/article/html/13/13655/index.html 。

一、分析html规则

思路是获取小说章节目录，循环目录，抓取所有章节中的内容，拼到txt文本中。最后形成完本小说。

1、获取小说章节目录

通过分析，我在标注的地方获取小说名字及章节目录。

// 获取小说名字
// 所有的章节都在这个table中。

下面是利用正则，获取名字与目录。

// NOTE(review): the HTML tags inside these string literals were stripped when
// the article was scraped. The patterns below are reconstructed from the
// surrounding text (keywords meta tag, chapter <table>) — confirm them against
// the live page markup before relying on them.

// Novel name: the first comma-separated entry of the keywords meta tag.
Match ma_name = Regex.Match(html, @"<meta name=""keywords"".+content=""(.+)""/>");
string name = ma_name.Groups[1].Value.Split(',')[0];

// Chapter directory: the whole <table> element that holds every chapter link.
Regex reg_mulu = new Regex(@"<table cellspacing=""1"" cellpadding=""0"" bgcolor=""#E4E4E4"" id=""at"">(.|\n)*?</table>");
var mat_mulu = reg_mulu.Match(html);
string mulu = mat_mulu.Groups[0].ToString();

2、获取小说正文内容

通过章节a标签中的url地址，查看章节内容。

通过分析，正文内容在 <dd id="contents"> 标签中。

// NOTE(review): the HTML tags inside these string literals were stripped when
// the article was scraped; the values below are reconstructed — confirm them
// against the chapter page markup.

// Chapter body: everything inside the element that wraps the chapter text.
Regex reg = new Regex(@"<dd id=""contents"">(.|\n)*?</dd>");
var mat = reg.Match(html_z);

// Drop the wrapper tags and non-breaking-space entities, and turn HTML line
// breaks into text line breaks.
string content = mat.Groups[0].ToString()
    .Replace("<dd id=\"contents\">", "")
    .Replace("</dd>", "")
    .Replace("&nbsp;", "")
    .Replace("<br />", "\r\n");

二、C#完整代码

using System;
using System.Collections;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Web;
using System.Web.Mvc;

namespace Test.Controllers
{
public class CrawlerController : BaseController
{
// GET: Crawler
/// <summary>
/// Crawls one complete novel: downloads the chapter index page, follows every
/// chapter link found in it, and appends each chapter's title and text to
/// "Txt/&lt;novel name&gt;.txt" under the application's base directory.
/// </summary>
/// <remarks>
/// NOTE(review): the HTML tags inside the regex/Replace string literals were
/// stripped when this code was scraped from the original article. They are
/// reconstructed here — confirm each pattern against the live page markup.
/// </remarks>
public void Index()
{
    // Index page of the novel being crawled.
    string html = HttpGet("http://www.23us.so/files/article/html/13/13655/index.html", "");

    // Novel name: the first comma-separated entry of the keywords meta tag.
    Match ma_name = Regex.Match(html, @"<meta name=""keywords"".+content=""(.+)""/>");
    string name = ma_name.Groups[1].Value.Split(',')[0];

    // Chapter directory: the whole <table> element that holds every chapter link.
    Regex reg_mulu = new Regex(@"<table cellspacing=""1"" cellpadding=""0"" bgcolor=""#E4E4E4"" id=""at"">(.|\n)*?</table>");
    string mulu = reg_mulu.Match(html).Groups[0].ToString();

    // Each anchor in the directory: group 1 is the chapter URL, group 2 the chapter title.
    Regex tmpreg = new Regex("<a[^>]+?href=\"([^\"]+)\"[^>]*>([^<]+)</a>", RegexOptions.Compiled);

    // Chapter body wrapper; built once instead of once per chapter.
    Regex reg = new Regex(@"<dd id=""contents"">(.|\n)*?</dd>");

    // Output folder does not change between chapters, so compute it once.
    string path = AppDomain.CurrentDomain.BaseDirectory.Replace("\\", "/") + "Txt/";

    // An empty match collection simply skips the loop.
    foreach (Match chapter in tmpreg.Matches(mulu))
    {
        string title = chapter.Groups[2].Value;

        // Download the chapter page. The novel name could also be read from the
        // chapter page's keywords meta tag, and the title from its <h1> element.
        string html_z = HttpGet(chapter.Groups[1].Value, "");

        // Drop the wrapper tags and non-breaking-space entities, and turn HTML
        // line breaks into text line breaks.
        string content = reg.Match(html_z).Groups[0].ToString()
            .Replace("<dd id=\"contents\">", "")
            .Replace("</dd>", "")
            .Replace("&nbsp;", "")
            .Replace("<br />", "\r\n");

        // Append "title + body" to the novel's text file.
        Novel(title + "\r\n" + content, name, path);
    }
}

/// <summary>
/// Appends a piece of text to the file "<paramref name="path"/> + <paramref name="name"/> + .txt",
/// creating the directory and the file first when they do not exist.
/// </summary>
/// <param name="content">Text to append; an extra "\r\n" plus a line terminator is written after it.</param>
/// <param name="name">File name without the ".txt" extension.</param>
/// <param name="path">Target directory; it is concatenated with <paramref name="name"/> as-is, so it must end with a separator.</param>
public void Novel(string content, string name, string path)
{
    string Log = content + "\r\n";

    // CreateDirectory is a no-op when the directory already exists.
    Directory.CreateDirectory(path);

    // Append mode creates the file when it is missing, so one code path covers
    // both the first chapter and all later ones. The using block guarantees the
    // file handle is released even if the write throws.
    using (StreamWriter writer = new StreamWriter(path + name + ".txt", true))
    {
        writer.WriteLine(Log);
    }
}

/// <summary>
/// Sends a form-urlencoded POST request and returns the response body decoded as UTF-8.
/// </summary>
/// <param name="Url">Absolute URL to post to.</param>
/// <param name="postDataStr">Already-encoded form data; null is treated as an empty body.</param>
/// <returns>The full response body as a string.</returns>
public string HttpPost(string Url, string postDataStr)
{
    // Encode the body once so the declared Content-Length always matches the
    // bytes actually sent. (Previously the length was counted in UTF-8 while
    // the body was written as gb2312, which disagree for non-ASCII data.)
    byte[] body = Encoding.GetEncoding("gb2312").GetBytes(postDataStr ?? string.Empty);

    CookieContainer cookie = new CookieContainer();
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
    request.Method = "POST";
    request.ContentType = "application/x-www-form-urlencoded";
    request.ContentLength = body.Length;
    request.CookieContainer = cookie;

    using (Stream requestStream = request.GetRequestStream())
    {
        requestStream.Write(body, 0, body.Length);
    }

    // Dispose the response and its stream even when reading fails.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    {
        response.Cookies = cookie.GetCookies(response.ResponseUri);

        using (Stream responseStream = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(responseStream, Encoding.GetEncoding("utf-8")))
        {
            return reader.ReadToEnd();
        }
    }
}

/// <summary>
/// Sends a GET request and returns the response body decoded as UTF-8.
/// </summary>
/// <param name="Url">Absolute URL to fetch.</param>
/// <param name="postDataStr">Optional query string (without the leading '?'); null or empty means no query string.</param>
/// <returns>The full response body as a string.</returns>
/// <exception cref="WebException">The request failed.</exception>
public string HttpGet(string Url, string postDataStr)
{
    string target = string.IsNullOrEmpty(postDataStr) ? Url : Url + "?" + postDataStr;

    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(target);
    request.Method = "GET";
    request.ContentType = "text/html;charset=UTF-8";

    // The former catch block called GetResponse() a second time on the same
    // request object. That does not reissue the request, so it could not
    // recover from the failure; the WebException is now simply propagated.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream responseStream = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(responseStream, Encoding.GetEncoding("utf-8")))
    {
        return reader.ReadToEnd();
    }
}
}
}

三、最后效果

来源：苍
cnblogs.com/cang12138/p/7464226.html
程序员共读整理发布，转载请联系作者获得授权

【点击成为安卓大神】

推荐文章

程序员小灰 · 49k*15薪！进字节了！
4 天前

OSC开源社区 · 黄仁勋与沈向洋万字对话实录：谈Scaling Law、机器人和爱情……
4 天前

码农翻身 · 字节跳动真的太懂程序员了!
3 天前

CTO肉饼 · 我是怎样和成为亿万富翁三次擦肩而过的
1 周前

OSC开源社区 · 通义灵码SWE-GPT：从静态代码建模迈向软件开发过程长链推理
1 周前

19楼 · 急，在线等！老婆背着我偷偷买验孕棒是几个意思...
8 年前

深广电第一现场 · 【后悔】黄昏恋的骗局人财两空多么痛的领悟
7 年前

班主任家园 · 中小学开学季全攻略，火速收藏（转给家长和学生）
7 年前

黄生看经济 · 2017最牛QIFF+险资扎堆=“赣锋锂业第二”，该股启动迹象明显，后市将火箭式上升！
7 年前

筑龙园林景观 · 这8位国际大咖，是这样设计城市滨水景观的！！
7 年前