全国新闻资讯采集模块完善

develop
old易 2024-12-11 15:34:23 +08:00
parent 75e2ba93cd
commit 55a3a10528
2 changed files with 174 additions and 31 deletions

View File

@ -17,10 +17,12 @@ namespace New_College.Tasks
/// ///
/// </summary> /// </summary>
private ID_NewsInfoServices newsInfoServices; private ID_NewsInfoServices newsInfoServices;
private ISysRegionServices sysRegionServices;
// 这里可以注入 // 这里可以注入
public JobTimedSpiderService(ID_NewsInfoServices d_NewsInfoServices) public JobTimedSpiderService(ID_NewsInfoServices d_NewsInfoServices, ISysRegionServices sysRegionServices)
{ {
newsInfoServices = d_NewsInfoServices; newsInfoServices = d_NewsInfoServices;
this.sysRegionServices = sysRegionServices;
} }
public Task StartAsync(CancellationToken cancellationToken) public Task StartAsync(CancellationToken cancellationToken)
@ -32,24 +34,60 @@ namespace New_College.Tasks
return Task.CompletedTask; return Task.CompletedTask;
} }
private void DoWork(object state) private async void DoWork(object state)
{
//try
//{
// HtmlAgHelper agHelper = new HtmlAgHelper();
// var list = agHelper.HtmlCreatePageData();
// list.ForEach(async c =>
// {
// var newsinfo = await newsInfoServices.Query(e => e.Title.Trim() == c.title);
// if (!newsinfo.Any() && c.pubtime.Year > DateTime.Now.Year - 1)
// {
// await newsInfoServices.Add(new Model.Models.D_NewsInfo()
// {
// Author = c.author,
// CategoryId = 1,
// CreateBy = "spdier",
// ProvinceCode = "370000",
// CreateId = 1,
// CreateTime = c.pubtime,
// Detail = c.detail,
// CoverImg = "https://static-data.ycymedu.com/static/newstop.png",
// OrderSort = 0,
// IsDelete = false,
// Title = c.title,
// Summary = HtmlHelper.ReplaceHtmlTag(c.detail).Length > 200 ? HtmlHelper.ReplaceHtmlTag(c.detail).Substring(0, 200) : HtmlHelper.ReplaceHtmlTag(c.detail)
// });
// }
// });
// ConsoleHelper.WriteWarningLine($"Job spider success {DateTime.Now}-{list.Count}");
//}
//catch (Exception ex)
//{
// ConsoleHelper.WriteWarningLine($"Job spider 抓取异常");
//}
NationWideNewsAgHelper anghelper = new NationWideNewsAgHelper();
var provineInfo = await this.sysRegionServices.Query(e => e.Level == 1 && !e.RegionCode.Contains("-"));
provineInfo.ForEach(p =>
{ {
try try
{ {
HtmlAgHelper agHelper = new HtmlAgHelper(); var list = anghelper.HtmlCreatePageData(p.RegionCode);
var list = agHelper.HtmlCreatePageData();
list.ForEach(async c => list.ForEach(async c =>
{ {
var newsinfo = await newsInfoServices.Query(e => e.Title.Trim() == c.title); var newsinfo = await newsInfoServices.Query(e => e.Title.Trim() == c.title);
if (!newsinfo.Any() && c.pubtime.Year > DateTime.Now.Year - 1) if (!newsinfo.Any() && c.pubtime.Year > DateTime.Now.Year - 1)
{ {
await newsInfoServices.Add(new Model.Models.D_NewsInfo() await newsInfoServices.Add(new Model.Models.D_NewsInfo()
{ {
Author = c.author, Author = c.author,
CategoryId = 1, CategoryId = 1,
CreateBy = "spdier", CreateBy = "spdier",
ProvinceCode = "370000", ProvinceCode = p.RegionCode,
CreateId = 1, CreateId = 1,
CreateTime = c.pubtime, CreateTime = c.pubtime,
Detail = c.detail, Detail = c.detail,
@ -68,6 +106,9 @@ namespace New_College.Tasks
{ {
ConsoleHelper.WriteWarningLine($"Job spider 抓取异常"); ConsoleHelper.WriteWarningLine($"Job spider 抓取异常");
} }
});
} }
public Task StopAsync(CancellationToken cancellationToken) public Task StopAsync(CancellationToken cancellationToken)

View File

@ -0,0 +1,102 @@
using Aliyun.OSS;
using HtmlAgilityPack;
using Microsoft.AspNetCore.Mvc.RazorPages;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net.Http;
using System.Text;
using System.Threading.Tasks;
using System.Text.Json;
using New_College.Common.Helper;
using System.Threading;
namespace New_College.Tasks
{
/// <summary>
/// Scrapes news articles for a single province from gaokao.chsi.com.cn:
/// queries the site's news search endpoint, then downloads and parses each listed article page.
/// </summary>
public class NationWideNewsAgHelper
{
    /// <summary>Scheme and host that root-relative links found in article HTML are resolved against.</summary>
    private const string SiteRoot = "https://gaokao.chsi.com.cn";

    /// <summary>News search endpoint; the province filter value is appended directly after <c>ss=</c>.</summary>
    private const string SearchApiUrl = SiteRoot + "/wap/news/search/5018267?ps=20&ss=";

    /// <summary>
    /// Single shared client: creating a new <see cref="HttpClient"/> per call can exhaust sockets
    /// when the method is invoked once per province in a loop.
    /// </summary>
    private static readonly HttpClient SharedHttpClient = new HttpClient();

    /// <summary>
    /// Substrings that mark an article as referencing downloadable attachments.
    /// ".doc" and ".xls" also match ".docx" and ".xlsx" because the check is a substring test.
    /// </summary>
    private static readonly string[] AttachmentMarkers = { ".pdf", ".doc", ".xls" };

    /// <summary>
    /// Fetches the news list for one province and returns the articles that could be parsed.
    /// </summary>
    /// <param name="provinceCode">
    /// Region code of the province; every "0000" sequence is removed before it is sent as the
    /// search filter (e.g. "370000" becomes "37").
    /// </param>
    /// <returns>
    /// The parsed articles. The list is empty when the search request fails or returns no entries.
    /// Articles whose page cannot be loaded or does not have the expected layout are skipped.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="provinceCode"/> is null.</exception>
    public List<NewsModels> HtmlCreatePageData(string provinceCode)
    {
        if (provinceCode == null)
        {
            throw new ArgumentNullException(nameof(provinceCode));
        }

        var list = new List<NewsModels>();
        // The "_t" timestamp is a cache-buster so every run sees the current list.
        var baseUrl = $"{SearchApiUrl}{provinceCode.Replace("0000", "")}&_t={DateTimeOffset.Now.ToUnixTimeSeconds()}";

        // GetAwaiter().GetResult() keeps the method synchronous for existing callers while
        // surfacing the real exception instead of an AggregateException wrapper.
        using var response = SharedHttpClient.GetAsync(baseUrl).GetAwaiter().GetResult();
        if (!response.IsSuccessStatusCode)
        {
            Console.WriteLine($"Failed to fetch page {baseUrl}, Status Code: {response.StatusCode}");
            return list;
        }

        var jsonData = response.Content.ReadAsStringAsync().GetAwaiter().GetResult();
        Console.WriteLine(jsonData);
        var resultlist = JsonSerializer.Deserialize<GaokaoObject>(jsonData);
        if (resultlist?.msg == null || resultlist.msg.Count == 0)
        {
            return list;
        }

        var webClient = new HtmlWeb();
        foreach (var entry in resultlist.msg)
        {
            if (string.IsNullOrEmpty(entry?.uri))
            {
                continue;
            }

            // Small pause between article downloads to avoid hammering the site.
            Thread.Sleep(100);
            string newsUrl = $"{SiteRoot}{entry.uri}";
            try
            {
                var news = ParseArticle(webClient.Load(newsUrl));
                if (news != null)
                {
                    list.Add(news);
                }
            }
            catch (Exception ex)
            {
                // One unreachable or malformed article must not discard the rest of the batch.
                Console.WriteLine($"Failed to load article {newsUrl}: {ex.Message}");
            }
        }

        return list;
    }

    /// <summary>
    /// Extracts title, source, publish time and body HTML from a downloaded article page.
    /// </summary>
    /// <returns>
    /// The parsed article, or null when an expected element is missing, the publish time cannot
    /// be parsed, or the body text contains "浏览器" (presumably a "please open in a browser"
    /// placeholder page — confirm against the site).
    /// </returns>
    private static NewsModels ParseArticle(HtmlDocument doc)
    {
        var root = doc.DocumentNode;
        var titleNode = root.SelectSingleNode("//*[@id=\"app\"]/div[2]/h2");
        var authorNode = root.SelectSingleNode("//*[@id=\"app\"]/div[3]/div[1]/span[2]");
        var timeNode = root.SelectSingleNode("//*[@id=\"app\"]/div[3]/div[1]/span[1]");
        var bodyNode = root.SelectSingleNode("//*[@id=\"article_dnull\"]");

        // SelectSingleNode returns null when nothing matches; a page with a different layout is skipped.
        if (titleNode == null || authorNode == null || timeNode == null || bodyNode == null)
        {
            return null;
        }

        if (!DateTime.TryParse(timeNode.InnerText, out var pubtime))
        {
            return null;
        }

        // Only the body HTML is returned, so only links inside it need rewriting.
        MakeAttributeAbsolute(bodyNode, ".//a[@href]", "href");
        MakeAttributeAbsolute(bodyNode, ".//*[@src]", "src");

        var innerhtml = bodyNode.InnerHtml;
        if (HtmlHelper.ReplaceHtmlTag(innerhtml).Contains("浏览器"))
        {
            return null;
        }

        if (AttachmentMarkers.Any(marker => innerhtml.Contains(marker)))
        {
            innerhtml = $"{innerhtml}\n若有附件详情请至本省招生考试院下载附件!!!";
        }

        return new NewsModels()
        {
            title = titleNode.InnerText,
            author = authorNode.InnerText.Replace("来源:", ""),
            pubtime = pubtime,
            detail = innerhtml
        };
    }

    /// <summary>
    /// Rewrites the given attribute on every node matched by <paramref name="xpath"/> (evaluated
    /// relative to <paramref name="scope"/>) so that site-relative URLs become absolute.
    /// </summary>
    private static void MakeAttributeAbsolute(HtmlNode scope, string xpath, string attributeName)
    {
        var nodes = scope.SelectNodes(xpath);
        if (nodes == null)
        {
            return;
        }

        foreach (var node in nodes)
        {
            string current = node.GetAttributeValue(attributeName, "");
            string absolute = ToAbsoluteUrl(current);
            if (!string.Equals(current, absolute, StringComparison.Ordinal))
            {
                node.SetAttributeValue(attributeName, absolute);
            }
        }
    }

    /// <summary>
    /// Resolves a root-relative ("/path") or protocol-relative ("//host/path") URL to an absolute
    /// https URL. Any other value (already absolute, anchors, data URIs, empty) is returned unchanged.
    /// </summary>
    private static string ToAbsoluteUrl(string url)
    {
        if (string.IsNullOrEmpty(url))
        {
            return url;
        }

        if (url.StartsWith("//", StringComparison.Ordinal))
        {
            return "https:" + url;
        }

        if (url.StartsWith("/", StringComparison.Ordinal))
        {
            return SiteRoot + url;
        }

        return url;
    }
}
}
/// <summary>
/// Shape of the JSON payload returned by the news search endpoint, as deserialized by
/// <c>NationWideNewsAgHelper.HtmlCreatePageData</c>. Property names match the JSON keys.
/// </summary>
public class GaokaoObject
{
    /// <summary>
    /// News entries in the response. Defaults to an empty list so that a payload without a
    /// "msg" key does not leave consumers with a null collection.
    /// </summary>
    public List<Msg> msg { get; set; } = new List<Msg>();

    /// <summary>Flag returned by the endpoint; presumably a success indicator — TODO confirm.</summary>
    public bool flag { get; set; }
}
/// <summary>
/// One news entry in the search response. Property names match the JSON keys.
/// </summary>
public class Msg
{
// Headline of the entry as returned by the endpoint.
public string title { get; set; }
// Presumably a shortened form of the title for list display — TODO confirm.
public string truncTitle { get; set; }
// Site-relative path of the article; the scraper appends it to the site root to build the page URL.
public string uri { get; set; }
// Presumably the publish date formatted for display — TODO confirm format.
public string displayDate { get; set; }
}