Add a timed news spider for www.sdzk.cn.

JobTimedSpiderService runs HtmlAgHelper every eight hours and stores the
articles that are not in the news table yet. New_Spider gets a console
variant (HtmlAgNewsHelper) that only prints what it reads.

NOTE(review): the two .csproj hunks and the unchanged context lines of
JobSetup.cs reached review with their angle-bracketed text missing. They
are kept as received and must be restored from the repository before this
patch is applied.

diff --git a/New_College.Common/Helper/HtmlHelper.cs b/New_College.Common/Helper/HtmlHelper.cs
index 5d02c33..7a6e1b4 100644
--- a/New_College.Common/Helper/HtmlHelper.cs
+++ b/New_College.Common/Helper/HtmlHelper.cs
@@ -17,7 +17,7 @@
             if (length > 0 && strText.Length > length)
                 return strText.Substring(0, length);
 
-            return strText;
+            return strText.Trim();
         }
         #endregion
     }
diff --git a/New_College.Extensions/ServiceExtensions/JobSetup.cs b/New_College.Extensions/ServiceExtensions/JobSetup.cs
index f88d7fa..4bb2240 100644
--- a/New_College.Extensions/ServiceExtensions/JobSetup.cs
+++ b/New_College.Extensions/ServiceExtensions/JobSetup.cs
@@ -16,6 +16,7 @@ namespace New_College.Extensions
             //services.AddHostedService();
             //services.AddHostedService();
 
+            services.AddHostedService<JobTimedSpiderService>();
             services.AddHostedService();
             services.AddSingleton();
             services.AddTransient();//Job使用瞬时依赖注入
diff --git a/New_College.Tasks/HostedService/JobTimedSpiderService.cs b/New_College.Tasks/HostedService/JobTimedSpiderService.cs
new file mode 100644
index 0000000..3756d63
--- /dev/null
+++ b/New_College.Tasks/HostedService/JobTimedSpiderService.cs
@@ -0,0 +1,139 @@
+using New_College.Common.Helper;
+using Microsoft.Extensions.Hosting;
+using System;
+using System.Threading;
+using System.Threading.Tasks;
+using New_College.IServices;
+using System.Linq;
+using New_College.Model.Models;
+using System.Collections.Generic;
+
+namespace New_College.Tasks
+{
+    /// <summary>
+    /// Hosted service that runs the news spider on a timer and stores the
+    /// scraped articles that are not in the news table yet.
+    /// </summary>
+    public class JobTimedSpiderService : IHostedService, IDisposable
+    {
+        /// <summary>Maximum number of characters kept in an article summary.</summary>
+        private const int SummaryMaxLength = 200;
+
+        /// <summary>Delay between two spider runs (eight hours).</summary>
+        private static readonly TimeSpan RunInterval = TimeSpan.FromHours(8);
+
+        private Timer _timer;
+        private readonly ID_NewsInfoServices newsInfoServices;
+
+        /// <summary>Creates the service.</summary>
+        /// <param name="d_NewsInfoServices">Service used to query and insert news rows.</param>
+        public JobTimedSpiderService(ID_NewsInfoServices d_NewsInfoServices)
+        {
+            newsInfoServices = d_NewsInfoServices;
+        }
+
+        /// <summary>Creates the timer; the first run is due immediately.</summary>
+        public Task StartAsync(CancellationToken cancellationToken)
+        {
+            Console.WriteLine("Job spider is starting.");
+
+            _timer = new Timer(DoWork, null, TimeSpan.Zero, RunInterval);
+            return Task.CompletedTask;
+        }
+
+        /// <summary>
+        /// Timer callback. RunOnceAsync reports its own failures, so the
+        /// task it returns is deliberately not awaited.
+        /// </summary>
+        private void DoWork(object state)
+        {
+            _ = RunOnceAsync();
+        }
+
+        /// <summary>
+        /// Scrapes the article list, stores the new articles one after
+        /// another and logs the outcome.
+        /// </summary>
+        private async Task RunOnceAsync()
+        {
+            try
+            {
+                HtmlAgHelper agHelper = new HtmlAgHelper();
+                var list = agHelper.HtmlCreatePageData();
+
+                // Await each item in turn. List.ForEach with an async lambda
+                // only starts the work: the log line would run before the
+                // inserts finished and a failed insert would go unobserved.
+                foreach (var item in list)
+                {
+                    await SaveIfNewAsync(item);
+                }
+
+                ConsoleHelper.WriteWarningLine($"Job spider success: {DateTime.Now}-{list.Count}");
+            }
+            catch (Exception ex)
+            {
+                // Nothing awaits this task, so the failure is reported here.
+                ConsoleHelper.WriteWarningLine($"Job spider failed: {DateTime.Now}-{ex}");
+            }
+        }
+
+        /// <summary>
+        /// Inserts <paramref name="item"/> unless it was published before the
+        /// current year or a row with the same title already exists.
+        /// </summary>
+        private async Task SaveIfNewAsync(NewsModels item)
+        {
+            // Same cut-off as "pubtime.Year > DateTime.Now.Year - 1".
+            if (item.pubtime.Year < DateTime.Now.Year)
+            {
+                return;
+            }
+
+            // Trim both sides so that whitespace around the scraped heading
+            // cannot defeat the duplicate check.
+            var title = (item.title ?? string.Empty).Trim();
+            var existing = await newsInfoServices.Query(e => e.Title.Trim() == title);
+            if (existing.Any())
+            {
+                return;
+            }
+
+            // Strip the markup once and reuse the result.
+            var plainText = HtmlHelper.ReplaceHtmlTag(item.detail);
+
+            await newsInfoServices.Add(new Model.Models.D_NewsInfo()
+            {
+                Author = item.author,
+                CategoryId = 1,
+                // NOTE(review): "spdier" looks like a typo of "spider"; left unchanged
+                // because it is stored data - confirm before correcting.
+                CreateBy = "spdier",
+                CreateId = 1,
+                CreateTime = item.pubtime,
+                Detail = item.detail,
+                CoverImg = "https://static-data.ycymedu.com/static/newstop.png",
+                OrderSort = 0,
+                IsDelete = false,
+                Title = title,
+                Summary = plainText.Length > SummaryMaxLength ? plainText.Substring(0, SummaryMaxLength) : plainText
+            });
+        }
+
+        /// <summary>Stops the timer from firing again.</summary>
+        public Task StopAsync(CancellationToken cancellationToken)
+        {
+            Console.WriteLine("Job spider is stopping.");
+
+            _timer?.Change(Timeout.Infinite, 0);
+
+            return Task.CompletedTask;
+        }
+
+        /// <summary>Releases the timer.</summary>
+        public void Dispose()
+        {
+            _timer?.Dispose();
+        }
+    }
+}
diff --git a/New_College.Tasks/HtmlAgSpider/HtmlAgHelper.cs b/New_College.Tasks/HtmlAgSpider/HtmlAgHelper.cs
new file mode 100644
index 0000000..9a7576c
--- /dev/null
+++ b/New_College.Tasks/HtmlAgSpider/HtmlAgHelper.cs
@@ -0,0 +1,125 @@
+using Aliyun.OSS.Model;
+using HtmlAgilityPack;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+
+namespace New_College.Tasks
+{
+    /// <summary>
+    /// Scrapes the news list on www.sdzk.cn and the article pages it links to.
+    /// </summary>
+    public class HtmlAgHelper
+    {
+        private const string SiteRoot = "https://www.sdzk.cn/";
+        private const string ListUrl = "https://www.sdzk.cn/NewsList.aspx?BCID=2";
+        private const string ListItemXPath = "//*[@id=\"ctl00_ContentPlaceHolder1_ctl00_ContentPlaceHolder1_RadListView1Panel\"]/ul/li";
+        private const string ArticleXPath = "//*[@id=\"form1\"]/div[6]/div[2]";
+
+        /// <summary>
+        /// Loads the news list, follows every article link and returns the
+        /// articles that could be read. Articles whose page cannot be loaded
+        /// or parsed are reported on the console and skipped.
+        /// </summary>
+        /// <returns>The parsed articles; empty when the list has no entries.</returns>
+        public List<NewsModels> HtmlCreatePageData()
+        {
+            var list = new List<NewsModels>();
+            HtmlWeb webClient = new HtmlWeb();
+            HtmlDocument listPage = webClient.Load(ListUrl);
+
+            // SelectNodes returns null, not an empty collection, when nothing matches.
+            HtmlNodeCollection categoryNodeList = listPage.DocumentNode.SelectNodes(ListItemXPath);
+            if (categoryNodeList == null)
+            {
+                return list;
+            }
+
+            foreach (var item in categoryNodeList)
+            {
+                var href = item.FirstChild?.Attributes["href"]?.Value;
+                if (string.IsNullOrEmpty(href))
+                {
+                    continue;
+                }
+
+                var url = SiteRoot + href;
+                Console.WriteLine($"{url}|{item.InnerText}");
+
+                try
+                {
+                    if (TryReadArticle(webClient.Load(url), out var article))
+                    {
+                        list.Add(article);
+                    }
+                    else
+                    {
+                        Console.WriteLine($"Skipped, unexpected page layout: {url}");
+                    }
+                }
+                catch (Exception ex)
+                {
+                    // One unreachable article must not discard the others.
+                    Console.WriteLine($"Skipped, {ex.Message}: {url}");
+                }
+            }
+
+            return list;
+        }
+
+        /// <summary>
+        /// Reads title, author, publication time and body from an article page.
+        /// </summary>
+        /// <param name="page">The loaded article page.</param>
+        /// <param name="article">The parsed article, or null on failure.</param>
+        /// <returns>False when an expected node, marker or date is missing.</returns>
+        private static bool TryReadArticle(HtmlDocument page, out NewsModels article)
+        {
+            article = null;
+
+            var root = page.DocumentNode;
+            var titleNode = root.SelectSingleNode(ArticleXPath + "/h3");
+            var metaNode = root.SelectSingleNode(ArticleXPath + "/em");
+            var bodyNode = root.SelectSingleNode(ArticleXPath + "/div");
+            if (titleNode == null || metaNode == null || bodyNode == null)
+            {
+                return false;
+            }
+
+            var metaText = metaNode.InnerText;
+            var authorParts = metaText.Split("作者:");
+            var pubTimeParts = metaText.Split("发布时间:");
+            if (authorParts.Length < 2 || pubTimeParts.Length < 2)
+            {
+                return false;
+            }
+
+            // Parses like Convert.ToDateTime, but reports bad input instead of throwing.
+            if (!DateTime.TryParse(pubTimeParts[1], out var pubtime))
+            {
+                return false;
+            }
+
+            article = new NewsModels()
+            {
+                title = titleNode.InnerText,
+                author = authorParts[1].Split(" ")[0],
+                pubtime = pubtime,
+                detail = bodyNode.InnerHtml
+            };
+            return true;
+        }
+    }
+
+    /// <summary>One scraped news article.</summary>
+    public class NewsModels
+    {
+        public string title { get; set; }
+        public string author { get; set; }
+        public DateTime pubtime { get; set; }
+        public string detail { get; set; }
+    }
+
+}
diff --git a/New_College.Tasks/New_College.Tasks.csproj b/New_College.Tasks/New_College.Tasks.csproj
index 9bf2dfe..64f00c4 100644
--- a/New_College.Tasks/New_College.Tasks.csproj
+++ b/New_College.Tasks/New_College.Tasks.csproj
@@ -5,6 +5,7 @@
+
diff --git a/New_Spider/HtmlAgNewsHelper.cs b/New_Spider/HtmlAgNewsHelper.cs
new file mode 100644
index 0000000..8a1dad8
--- /dev/null
+++ b/New_Spider/HtmlAgNewsHelper.cs
@@ -0,0 +1,69 @@
+using HtmlAgilityPack;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+
+namespace New_Spider
+{
+    /// <summary>
+    /// Console scraper for the www.sdzk.cn news list: prints every article
+    /// link and the title, author and publication time found on its page.
+    /// </summary>
+    public class HtmlAgNewsHelper
+    {
+        private const string ArticleXPath = "//*[@id=\"form1\"]/div[6]/div[2]";
+
+        /// <summary>
+        /// Walks the news list and prints what was read from each article.
+        /// Synchronous: the method awaits nothing, and "async void" would
+        /// only make its exceptions impossible for the caller to catch.
+        /// </summary>
+        public void HtmlCreatePageData()
+        {
+            HtmlWeb webClient = new HtmlWeb();
+            HtmlDocument listPage = webClient.Load("https://www.sdzk.cn/NewsList.aspx?BCID=2");
+
+            // SelectNodes returns null, not an empty collection, when nothing matches.
+            HtmlNodeCollection categoryNodeList = listPage.DocumentNode.SelectNodes("//*[@id=\"ctl00_ContentPlaceHolder1_ctl00_ContentPlaceHolder1_RadListView1Panel\"]/ul/li");
+            if (categoryNodeList == null)
+            {
+                return;
+            }
+
+            foreach (var item in categoryNodeList)
+            {
+                var href = item.FirstChild?.Attributes["href"]?.Value;
+                if (string.IsNullOrEmpty(href))
+                {
+                    continue;
+                }
+
+                var url = "https://www.sdzk.cn/" + href;
+                Console.WriteLine($"{url}|{item.InnerText}");
+
+                var root = webClient.Load(url).DocumentNode;
+                var titleNode = root.SelectSingleNode(ArticleXPath + "/h3");
+                var metaNode = root.SelectSingleNode(ArticleXPath + "/em");
+                var bodyNode = root.SelectSingleNode(ArticleXPath + "/div");
+                if (titleNode == null || metaNode == null || bodyNode == null)
+                {
+                    Console.WriteLine($"Skipped, unexpected page layout: {url}");
+                    continue;
+                }
+
+                var authorParts = metaNode.InnerText.Split("作者:");
+                var pubTimeParts = metaNode.InnerText.Split("发布时间:");
+                if (authorParts.Length < 2 || pubTimeParts.Length < 2)
+                {
+                    Console.WriteLine($"Skipped, unexpected page layout: {url}");
+                    continue;
+                }
+
+                var author = authorParts[1].Split(" ")[0];
+                Console.WriteLine($"{titleNode.InnerText}|{author}|{pubTimeParts[1]}|{bodyNode.InnerHtml.Length}");
+            }
+        }
+    }
+}
diff --git a/New_Spider/New_Spider.csproj b/New_Spider/New_Spider.csproj
index 91f8ed1..df01b65 100644
--- a/New_Spider/New_Spider.csproj
+++ b/New_Spider/New_Spider.csproj
@@ -9,7 +9,7 @@
-
+
diff --git a/New_Spider/Program.cs b/New_Spider/Program.cs
index 4be4f6c..ea59bf0 100644
--- a/New_Spider/Program.cs
+++ b/New_Spider/Program.cs
@@ -6,10 +6,12 @@ using System.Text.RegularExpressions;
 //HtmlAgHelper htmlAgHelper = new HtmlAgHelper();
 //htmlAgHelper.HtmlCreatePageData();
 
-HtmlAgMajorHelper agMajorHelper = new HtmlAgMajorHelper();
+//HtmlAgMajorHelper agMajorHelper = new HtmlAgMajorHelper();
 //agMajorHelper.DownloadTypeListFile();
-agMajorHelper.DownloadChildTypeListFile();
+//agMajorHelper.DownloadChildTypeListFile();
 
+HtmlAgNewsHelper agNewsHelper = new HtmlAgNewsHelper();
+agNewsHelper.HtmlCreatePageData();
 
 
 Console.Read();