.NET使用分布式网络爬虫框架DotnetSpider快速开发爬虫功能

您所在的位置:网站首页 爬虫功能 .NET使用分布式网络爬虫框架DotnetSpider快速开发爬虫功能

.NET使用分布式网络爬虫框架DotnetSpider快速开发爬虫功能

2024-07-12 15:10| 来源: 网络整理| 查看: 265

    public class RecommendedRankingSpider : Spider    {        public RecommendedRankingSpider(IOptions options,            DependenceServices services,            ILogger logger) : base(options, services, logger)        {        }

        public static async Task RunAsync()        {            var builder = Builder.CreateDefaultBuilder();            builder.UseSerilog();            builder.UseDownloader();            builder.UseQueueDistinctBfsScheduler();            await builder.Build().RunAsync();        }

        protected override async Task InitializeAsync(CancellationToken stoppingToken = default)        {            // 添加自定义解析            AddDataFlow(new Parser());            // 使用控制台存储器            AddDataFlow(new ConsoleStorage());            // 添加采集请求            await AddRequestsAsync(new Request("https://www.cnblogs.com/aggsite/topdiggs")            {                // 请求超时10秒                Timeout = 10000            });        }

        class Parser : DataParser        {            public override Task InitializeAsync()            {                return Task.CompletedTask;            }

            protected override Task ParseAsync(DataFlowContext context)            {                var recommendedRankingList = new List();                // 网页数据解析                var recommendedList = context.Selectable.SelectList(Selectors.XPath(".//article[@class='post-item']"));                foreach (var news in recommendedList)                {                    var articleTitle = news.Select(Selectors.XPath(".//a[@class='post-item-title']"))?.Value;                    var articleSummary = news.Select(Selectors.XPath(".//p[@class='post-item-summary']"))?.Value?.Replace("\n", "").Replace(" ", "");                    var articleUrl = news.Select(Selectors.XPath(".//a[@class='post-item-title']/@href"))?.Value;

                    recommendedRankingList.Add(new RecommendedRankingModel                    {                        ArticleTitle = articleTitle,                        ArticleSummary = articleSummary,                        ArticleUrl = articleUrl                    });                }

                using (StreamWriter sw = new StreamWriter("recommendedRanking.txt"))                {                    foreach (RecommendedRankingModel model in recommendedRankingList)                    {                        string line = $"文章标题:{model.ArticleTitle}\r\n文章简介:{model.ArticleSummary}\r\n文章地址:{model.ArticleUrl}";                        sw.WriteLine(line+ "\r\n ==========================================================================================");                    }                }                return Task.CompletedTask;            }        }    }



【本文地址】


今日新闻


推荐新闻


CopyRight 2018-2019 办公设备维修网 版权所有 豫ICP备15022753号-3