全国新闻资讯采集模块完善
parent
75e2ba93cd
commit
55a3a10528
|
|
@ -17,10 +17,12 @@ namespace New_College.Tasks
|
|||
///
|
||||
/// </summary>
|
||||
private ID_NewsInfoServices newsInfoServices;
|
||||
private ISysRegionServices sysRegionServices;
|
||||
// 这里可以注入
|
||||
public JobTimedSpiderService(ID_NewsInfoServices d_NewsInfoServices)
|
||||
public JobTimedSpiderService(ID_NewsInfoServices d_NewsInfoServices, ISysRegionServices sysRegionServices)
|
||||
{
|
||||
newsInfoServices = d_NewsInfoServices;
|
||||
this.sysRegionServices = sysRegionServices;
|
||||
}
|
||||
|
||||
public Task StartAsync(CancellationToken cancellationToken)
|
||||
|
|
@ -32,42 +34,81 @@ namespace New_College.Tasks
|
|||
return Task.CompletedTask;
|
||||
}
|
||||
|
||||
private void DoWork(object state)
|
||||
private async void DoWork(object state)
|
||||
{
|
||||
try
|
||||
//try
|
||||
//{
|
||||
// HtmlAgHelper agHelper = new HtmlAgHelper();
|
||||
// var list = agHelper.HtmlCreatePageData();
|
||||
// list.ForEach(async c =>
|
||||
// {
|
||||
// var newsinfo = await newsInfoServices.Query(e => e.Title.Trim() == c.title);
|
||||
// if (!newsinfo.Any() && c.pubtime.Year > DateTime.Now.Year - 1)
|
||||
// {
|
||||
|
||||
// await newsInfoServices.Add(new Model.Models.D_NewsInfo()
|
||||
// {
|
||||
// Author = c.author,
|
||||
// CategoryId = 1,
|
||||
// CreateBy = "spdier",
|
||||
// ProvinceCode = "370000",
|
||||
// CreateId = 1,
|
||||
// CreateTime = c.pubtime,
|
||||
// Detail = c.detail,
|
||||
// CoverImg = "https://static-data.ycymedu.com/static/newstop.png",
|
||||
// OrderSort = 0,
|
||||
// IsDelete = false,
|
||||
// Title = c.title,
|
||||
// Summary = HtmlHelper.ReplaceHtmlTag(c.detail).Length > 200 ? HtmlHelper.ReplaceHtmlTag(c.detail).Substring(0, 200) : HtmlHelper.ReplaceHtmlTag(c.detail)
|
||||
// });
|
||||
|
||||
// }
|
||||
// });
|
||||
// ConsoleHelper.WriteWarningLine($"Job spider success: {DateTime.Now}-{list.Count}");
|
||||
//}
|
||||
//catch (Exception ex)
|
||||
//{
|
||||
// ConsoleHelper.WriteWarningLine($"Job spider 抓取异常");
|
||||
//}
|
||||
NationWideNewsAgHelper anghelper = new NationWideNewsAgHelper();
|
||||
var provineInfo = await this.sysRegionServices.Query(e => e.Level == 1 && !e.RegionCode.Contains("-"));
|
||||
provineInfo.ForEach(p =>
|
||||
{
|
||||
HtmlAgHelper agHelper = new HtmlAgHelper();
|
||||
var list = agHelper.HtmlCreatePageData();
|
||||
list.ForEach(async c =>
|
||||
try
|
||||
{
|
||||
var newsinfo = await newsInfoServices.Query(e => e.Title.Trim() == c.title);
|
||||
if (!newsinfo.Any() && c.pubtime.Year > DateTime.Now.Year - 1)
|
||||
var list = anghelper.HtmlCreatePageData(p.RegionCode);
|
||||
list.ForEach(async c =>
|
||||
{
|
||||
|
||||
await newsInfoServices.Add(new Model.Models.D_NewsInfo()
|
||||
var newsinfo = await newsInfoServices.Query(e => e.Title.Trim() == c.title);
|
||||
if (!newsinfo.Any() && c.pubtime.Year > DateTime.Now.Year - 1)
|
||||
{
|
||||
Author = c.author,
|
||||
CategoryId = 1,
|
||||
CreateBy = "spdier",
|
||||
ProvinceCode = "370000",
|
||||
CreateId = 1,
|
||||
CreateTime = c.pubtime,
|
||||
Detail = c.detail,
|
||||
CoverImg = "https://static-data.ycymedu.com/static/newstop.png",
|
||||
OrderSort = 0,
|
||||
IsDelete = false,
|
||||
Title = c.title,
|
||||
Summary = HtmlHelper.ReplaceHtmlTag(c.detail).Length > 200 ? HtmlHelper.ReplaceHtmlTag(c.detail).Substring(0, 200) : HtmlHelper.ReplaceHtmlTag(c.detail)
|
||||
});
|
||||
await newsInfoServices.Add(new Model.Models.D_NewsInfo()
|
||||
{
|
||||
Author = c.author,
|
||||
CategoryId = 1,
|
||||
CreateBy = "spdier",
|
||||
ProvinceCode = p.RegionCode,
|
||||
CreateId = 1,
|
||||
CreateTime = c.pubtime,
|
||||
Detail = c.detail,
|
||||
CoverImg = "https://static-data.ycymedu.com/static/newstop.png",
|
||||
OrderSort = 0,
|
||||
IsDelete = false,
|
||||
Title = c.title,
|
||||
Summary = HtmlHelper.ReplaceHtmlTag(c.detail).Length > 200 ? HtmlHelper.ReplaceHtmlTag(c.detail).Substring(0, 200) : HtmlHelper.ReplaceHtmlTag(c.detail)
|
||||
});
|
||||
|
||||
}
|
||||
});
|
||||
ConsoleHelper.WriteWarningLine($"Job spider success: {DateTime.Now}-{list.Count}");
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
ConsoleHelper.WriteWarningLine($"Job spider 抓取异常");
|
||||
}
|
||||
});
|
||||
|
||||
|
||||
}
|
||||
});
|
||||
ConsoleHelper.WriteWarningLine($"Job spider success: {DateTime.Now}-{list.Count}");
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
ConsoleHelper.WriteWarningLine($"Job spider 抓取异常");
|
||||
}
|
||||
}
|
||||
|
||||
public Task StopAsync(CancellationToken cancellationToken)
|
||||
|
|
|
|||
|
|
@ -0,0 +1,102 @@
|
|||
using Aliyun.OSS;
|
||||
using HtmlAgilityPack;
|
||||
using Microsoft.AspNetCore.Mvc.RazorPages;
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.IO;
|
||||
using System.Linq;
|
||||
using System.Net.Http;
|
||||
using System.Text;
|
||||
using System.Threading.Tasks;
|
||||
using System.Text.Json;
|
||||
using New_College.Common.Helper;
|
||||
using System.Threading;
|
||||
namespace New_College.Tasks
|
||||
{
|
||||
/// <summary>
/// Collects news articles from gaokao.chsi.com.cn for a single province:
/// queries the site's search endpoint for a list of articles, then loads and
/// parses each article page into a <see cref="NewsModels"/> entry.
/// </summary>
public class NationWideNewsAgHelper
{
    /// <summary>Site root used to turn root-relative article, link and media paths into absolute URLs.</summary>
    private const string SiteRoot = "https://gaokao.chsi.com.cn";

    /// <summary>Search endpoint; the province prefix and a cache-busting timestamp are appended per call.</summary>
    private const string SearchApiUrl = "https://gaokao.chsi.com.cn/wap/news/search/5018267?ps=20&ss=";

    /// <summary>
    /// Substrings that mark an article as referencing downloadable attachments.
    /// ".doc" and ".xls" also match ".docx" and ".xlsx".
    /// </summary>
    private static readonly string[] AttachmentMarkers = { ".pdf", ".doc", ".xls" };

    /// <summary>
    /// Shared client: creating and disposing an HttpClient per call leaves sockets
    /// lingering and can exhaust them when the method is invoked once per province.
    /// </summary>
    private static readonly HttpClient SharedHttpClient = new HttpClient();

    /// <summary>
    /// Fetches the article list for the given province and parses every article page.
    /// </summary>
    /// <param name="provinceCode">
    /// Region code of the province; every "0000" occurrence is stripped before the
    /// value is sent as the <c>ss</c> query parameter.
    /// </param>
    /// <returns>
    /// The parsed articles. Empty when the search request fails, the response has no
    /// entries, or no article page could be parsed. Articles whose page is missing an
    /// expected node, or whose timestamp cannot be parsed, are skipped individually.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="provinceCode"/> is null.</exception>
    public List<NewsModels> HtmlCreatePageData(string provinceCode)
    {
        if (provinceCode == null)
        {
            throw new ArgumentNullException(nameof(provinceCode));
        }

        var list = new List<NewsModels>();
        var baseUrl = $"{SearchApiUrl}{provinceCode.Replace("0000", "")}&_t={DateTimeOffset.Now.ToUnixTimeSeconds()}";

        // The public signature is synchronous, so the calls below have to block.
        // GetAwaiter().GetResult() surfaces the real exception instead of an AggregateException.
        using var response = SharedHttpClient.GetAsync(baseUrl).GetAwaiter().GetResult();
        if (!response.IsSuccessStatusCode)
        {
            Console.WriteLine($"Failed to fetch page {baseUrl}, Status Code: {response.StatusCode}");
            return list;
        }

        var jsonData = response.Content.ReadAsStringAsync().GetAwaiter().GetResult();
        Console.WriteLine(jsonData);

        var resultlist = JsonSerializer.Deserialize<GaokaoObject>(jsonData);
        if (resultlist?.msg == null || resultlist.msg.Count == 0)
        {
            return list;
        }

        var webClient = new HtmlWeb();
        foreach (var entry in resultlist.msg)
        {
            if (string.IsNullOrEmpty(entry?.uri))
            {
                // Nothing to load for this entry.
                continue;
            }

            // Small pause between page loads to avoid hammering the site.
            Thread.Sleep(100);

            var article = LoadArticle(webClient, entry.uri);
            if (article != null)
            {
                list.Add(article);
            }
        }

        return list;
    }

    /// <summary>
    /// Loads one article page and extracts title, source, publication time and body.
    /// </summary>
    /// <returns>
    /// The parsed article, or null when an expected node is missing, the timestamp is
    /// not parsable, or the body text contains "浏览器".
    /// </returns>
    private static NewsModels LoadArticle(HtmlWeb webClient, string uri)
    {
        HtmlDocument doc = webClient.Load($"{SiteRoot}{uri}");
        var root = doc.DocumentNode;

        var titleNode = root.SelectSingleNode("//*[@id=\"app\"]/div[2]/h2");
        var authorNode = root.SelectSingleNode("//*[@id=\"app\"]/div[3]/div[1]/span[2]");
        var timeNode = root.SelectSingleNode("//*[@id=\"app\"]/div[3]/div[1]/span[1]");
        var bodyNode = root.SelectSingleNode("//*[@id=\"article_dnull\"]");
        if (titleNode == null || authorNode == null || timeNode == null || bodyNode == null)
        {
            // Page layout differs from what the XPaths expect; skip this article only.
            return null;
        }

        if (!DateTime.TryParse(timeNode.InnerText, out var pubtime))
        {
            return null;
        }

        // Rewrite root-relative links and media sources on the nodes themselves so that
        // URLs which are already absolute (or protocol-relative / data:) stay untouched.
        MakeRootRelativeAbsolute(bodyNode, ".//a[@href]", "href");
        MakeRootRelativeAbsolute(bodyNode, ".//*[@src]", "src");

        var innerhtml = bodyNode.InnerHtml;
        if (HtmlHelper.ReplaceHtmlTag(innerhtml).Contains("浏览器"))
        {
            return null;
        }

        if (AttachmentMarkers.Any(marker => innerhtml.Contains(marker)))
        {
            innerhtml = $"{innerhtml}\n若有附件详情,请至本省招生考试院下载附件!!!";
        }

        return new NewsModels()
        {
            title = titleNode.InnerText,
            author = authorNode.InnerText.Replace("来源:", ""),
            pubtime = pubtime,
            detail = innerhtml
        };
    }

    /// <summary>
    /// Prefixes the site root onto every selected attribute value that starts with a
    /// single "/" (root-relative). Values starting with "//" are left as they are.
    /// </summary>
    private static void MakeRootRelativeAbsolute(HtmlNode scope, string xpath, string attributeName)
    {
        var nodes = scope.SelectNodes(xpath);
        if (nodes == null)
        {
            // SelectNodes yields null when nothing matches.
            return;
        }

        foreach (var node in nodes)
        {
            string value = node.GetAttributeValue(attributeName, "");
            bool rootRelative = value.StartsWith("/", StringComparison.Ordinal)
                && !value.StartsWith("//", StringComparison.Ordinal);
            if (rootRelative)
            {
                node.SetAttributeValue(attributeName, SiteRoot + value);
            }
        }
    }
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Deserialization target for the JSON returned by the news search endpoint.
/// Property names are lower-case so they bind to the JSON keys as-is.
/// </summary>
public class GaokaoObject
{
    /// <summary>
    /// Article entries from the response. Defaults to an empty list so that a payload
    /// which omits the "msg" key does not leave consumers with a null collection.
    /// </summary>
    public List<Msg> msg { get; set; } = new List<Msg>();

    /// <summary>
    /// Flag from the response; not read by the code visible here.
    /// NOTE(review): presumably a success indicator - confirm against the API.
    /// </summary>
    public bool flag { get; set; }
}
|
||||
|
||||
/// <summary>
/// One article entry inside the search response's "msg" array.
/// Property names and order are kept exactly, since JSON binding depends on them.
/// </summary>
public class Msg
{
    /// <summary>Article title as delivered by the search response.</summary>
    public string title { get; set; }

    /// <summary>NOTE(review): presumably a shortened form of the title - confirm against the API.</summary>
    public string truncTitle { get; set; }

    /// <summary>Path of the article page; appended to the site root to build the page URL.</summary>
    public string uri { get; set; }

    /// <summary>NOTE(review): presumably the date shown in the listing - confirm against the API.</summary>
    public string displayDate { get; set; }
}
|
||||
Loading…
Reference in New Issue