.NET使用分布式网络爬虫框架DotnetSpider快速开发爬虫功能 |
您所在的位置:网站首页 › 爬虫功能 › .NET使用分布式网络爬虫框架DotnetSpider快速开发爬虫功能 |
/// <summary>
/// Spider that requests https://www.cnblogs.com/aggsite/topdiggs, extracts every
/// <c>article</c> element with class <c>post-item</c>, and writes the title, summary
/// and URL of each entry to <c>recommendedRanking.txt</c>.
/// </summary>
/// <remarks>
/// NOTE(review): the generic type arguments below (<c>SpiderOptions</c>, <c>Spider</c>,
/// <c>HttpClientDownloader</c>, <c>HashSetDuplicateRemover</c>) were missing from the
/// snippet as published and have been restored from the DotnetSpider samples; confirm
/// them against the DotnetSpider version in use. They presumably require
/// <c>using DotnetSpider.Downloader;</c> and <c>using DotnetSpider.Scheduler.Component;</c>.
/// </remarks>
public class RecommendedRankingSpider : Spider
{
    /// <summary>
    /// Forwards the injected options, services and logger to the base <c>Spider</c>.
    /// </summary>
    public RecommendedRankingSpider(
        IOptions<SpiderOptions> options,
        DependenceServices services,
        ILogger<Spider> logger)
        : base(options, services, logger)
    {
    }

    /// <summary>
    /// Builds a host for this spider (Serilog logging, a downloader and a
    /// de-duplicating breadth-first queue scheduler) and runs it to completion.
    /// </summary>
    public static async Task RunAsync()
    {
        var builder = Builder.CreateDefaultBuilder<RecommendedRankingSpider>();
        builder.UseSerilog();
        builder.UseDownloader<HttpClientDownloader>();
        builder.UseQueueDistinctBfsScheduler<HashSetDuplicateRemover>();
        await builder.Build().RunAsync();
    }

    /// <summary>
    /// Registers the data flows (custom parser, then console storage) and enqueues
    /// the single start request.
    /// </summary>
    /// <param name="stoppingToken">Cancellation token supplied by the framework; not used here.</param>
    protected override async Task InitializeAsync(CancellationToken stoppingToken = default)
    {
        // Register the custom parser.
        AddDataFlow(new Parser());

        // Register the console storage.
        AddDataFlow(new ConsoleStorage());

        // Enqueue the page to crawl.
        await AddRequestsAsync(new Request("https://www.cnblogs.com/aggsite/topdiggs")
        {
            // Request timeout of 10 seconds.
            Timeout = 10000
        });
    }

    /// <summary>
    /// Parses the downloaded page into <c>RecommendedRankingModel</c> items and
    /// writes them to a text file.
    /// </summary>
    private sealed class Parser : DataParser
    {
        /// <summary>No initialization is required for this parser.</summary>
        public override Task InitializeAsync()
        {
            return Task.CompletedTask;
        }

        /// <summary>
        /// Extracts title, summary and URL from each post item and writes them to
        /// <c>recommendedRanking.txt</c>, replacing any previous content of that file.
        /// </summary>
        /// <param name="context">Data flow context whose <c>Selectable</c> exposes the page content.</param>
        protected override async Task ParseAsync(DataFlowContext context)
        {
            // Parse the page content.
            var recommendedList = context.Selectable.SelectList(Selectors.XPath(".//article[@class='post-item']"));

            // NOTE(review): SelectList presumably returns null when nothing matches;
            // guard so an unexpected page does not fail with a NullReferenceException.
            // The output file is left untouched in that case.
            if (recommendedList == null)
            {
                return;
            }

            var recommendedRankingList = new List<RecommendedRankingModel>();

            foreach (var news in recommendedList)
            {
                var articleTitle = news.Select(Selectors.XPath(".//a[@class='post-item-title']"))?.Value;

                // Strip line breaks and spaces from the summary text.
                var articleSummary = news.Select(Selectors.XPath(".//p[@class='post-item-summary']"))?.Value?.Replace("\n", "").Replace(" ", "");
                var articleUrl = news.Select(Selectors.XPath(".//a[@class='post-item-title']/@href"))?.Value;

                recommendedRankingList.Add(new RecommendedRankingModel
                {
                    ArticleTitle = articleTitle,
                    ArticleSummary = articleSummary,
                    ArticleUrl = articleUrl
                });
            }

            // Write asynchronously so the parsing pipeline is not blocked on file I/O.
            using (var writer = new StreamWriter("recommendedRanking.txt"))
            {
                foreach (RecommendedRankingModel model in recommendedRankingList)
                {
                    string line = $"文章标题:{model.ArticleTitle}\r\n文章简介:{model.ArticleSummary}\r\n文章地址:{model.ArticleUrl}";
                    await writer.WriteLineAsync(line + "\r\n ==========================================================================================");
                }
            }
        }
    }
}
今日新闻 |
推荐新闻 |
Copyright 2018-2019 办公设备维修网 版权所有 豫ICP备15022753号-3 |