全国新闻资讯采集模块完善
parent
75e2ba93cd
commit
55a3a10528
|
|
@ -17,10 +17,12 @@ namespace New_College.Tasks
|
||||||
///
|
///
|
||||||
/// </summary>
|
/// </summary>
|
||||||
private ID_NewsInfoServices newsInfoServices;
|
private ID_NewsInfoServices newsInfoServices;
|
||||||
|
private ISysRegionServices sysRegionServices;
|
||||||
// 这里可以注入
|
// 这里可以注入
|
||||||
/// <summary>
/// Creates the spider job and stores the two services it is given.
/// </summary>
/// <param name="d_NewsInfoServices">News service kept in the <c>newsInfoServices</c> field.</param>
/// <param name="sysRegionServices">Region service kept in the <c>sysRegionServices</c> field.</param>
public JobTimedSpiderService(ID_NewsInfoServices d_NewsInfoServices, ISysRegionServices sysRegionServices)
{
    this.sysRegionServices = sysRegionServices;
    this.newsInfoServices = d_NewsInfoServices;
}
|
||||||
|
|
||||||
public Task StartAsync(CancellationToken cancellationToken)
|
public Task StartAsync(CancellationToken cancellationToken)
|
||||||
|
|
@ -32,42 +34,81 @@ namespace New_College.Tasks
|
||||||
return Task.CompletedTask;
|
return Task.CompletedTask;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
/// Scrapes news for every level-1 region whose code contains no "-" and stores the
/// articles that are not yet in the database and were published this calendar year
/// or later.
/// </summary>
/// <param name="state">Unused callback state.</param>
/// <remarks>
/// NOTE(review): this is presumably invoked as a timer callback from StartAsync (not
/// fully visible here), which is why the <c>async void</c> signature is kept. Because
/// nothing can observe the returned task, every exception is caught inside the method.
/// </remarks>
private async void DoWork(object state)
{
    try
    {
        var anghelper = new NationWideNewsAgHelper();
        var provineInfo = await this.sysRegionServices.Query(e => e.Level == 1 && !e.RegionCode.Contains("-"));

        foreach (var p in provineInfo)
        {
            // One failing province must not stop the remaining ones.
            try
            {
                var list = anghelper.HtmlCreatePageData(p.RegionCode);

                // Awaited one by one: List.ForEach(async ...) would start fire-and-forget
                // lambdas whose exceptions escape this try/catch, and would run the
                // duplicate checks concurrently.
                foreach (var c in list)
                {
                    // The stored title is compared trimmed, so trim the scraped one too;
                    // otherwise the duplicate check can never match a padded title.
                    var title = c.title?.Trim();
                    var newsinfo = await newsInfoServices.Query(e => e.Title.Trim() == title);
                    if (newsinfo.Any() || c.pubtime.Year <= DateTime.Now.Year - 1)
                    {
                        continue;
                    }

                    // Strip the markup once instead of three times.
                    var plainText = HtmlHelper.ReplaceHtmlTag(c.detail);

                    await newsInfoServices.Add(new Model.Models.D_NewsInfo()
                    {
                        Author = c.author,
                        CategoryId = 1,
                        CreateBy = "spdier",
                        ProvinceCode = p.RegionCode,
                        CreateId = 1,
                        CreateTime = c.pubtime,
                        Detail = c.detail,
                        CoverImg = "https://static-data.ycymedu.com/static/newstop.png",
                        OrderSort = 0,
                        IsDelete = false,
                        Title = title,
                        Summary = plainText.Length > 200 ? plainText.Substring(0, 200) : plainText
                    });
                }

                // Logged only after every insert for this province has completed.
                ConsoleHelper.WriteWarningLine($"Job spider success: {DateTime.Now}-{list.Count}");
            }
            catch (Exception ex)
            {
                ConsoleHelper.WriteWarningLine($"Job spider 抓取异常: {ex.Message}");
            }
        }
    }
    catch (Exception ex)
    {
        // Covers the region query itself; an exception leaving an async void method
        // cannot be caught by the caller.
        ConsoleHelper.WriteWarningLine($"Job spider 抓取异常: {ex.Message}");
    }
}
|
||||||
|
|
||||||
public Task StopAsync(CancellationToken cancellationToken)
|
public Task StopAsync(CancellationToken cancellationToken)
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,102 @@
|
||||||
|
using Aliyun.OSS;
|
||||||
|
using HtmlAgilityPack;
|
||||||
|
using Microsoft.AspNetCore.Mvc.RazorPages;
|
||||||
|
using System;
|
||||||
|
using System.Collections.Generic;
|
||||||
|
using System.IO;
|
||||||
|
using System.Linq;
|
||||||
|
using System.Net.Http;
|
||||||
|
using System.Text;
|
||||||
|
using System.Threading.Tasks;
|
||||||
|
using System.Text.Json;
|
||||||
|
using New_College.Common.Helper;
|
||||||
|
using System.Threading;
|
||||||
|
namespace New_College.Tasks
|
||||||
|
{
|
||||||
|
/// <summary>
/// Fetches the news list for a province from the gaokao.chsi.com.cn search endpoint
/// and downloads each listed article page, returning the parsed articles.
/// </summary>
public class NationWideNewsAgHelper
{
    /// <summary>Site root used to turn root-relative links into absolute ones.</summary>
    private const string SiteRoot = "https://gaokao.chsi.com.cn";

    /// <summary>Search endpoint; the province prefix and a timestamp are appended to it.</summary>
    private const string SearchApiUrl = "https://gaokao.chsi.com.cn/wap/news/search/5018267?ps=20&ss=";

    /// <summary>One client for all calls instead of a new <see cref="HttpClient"/> per call.</summary>
    private static readonly HttpClient SharedHttpClient = new HttpClient();

    /// <summary>
    /// Matches <c>src="/</c> when the value is root-relative, i.e. not a protocol-relative
    /// <c>//host/...</c> value. Absolute URLs do not match and are left untouched.
    /// </summary>
    private static readonly System.Text.RegularExpressions.Regex RootRelativeSrc =
        new System.Text.RegularExpressions.Regex("src=\"/(?!/)");

    /// <summary>File-extension markers that trigger the attachment notice.</summary>
    private static readonly string[] AttachmentMarkers = { ".pdf", ".doc", ".docx", ".xls", ".xlsx" };

    /// <summary>
    /// Downloads the news list for <paramref name="provinceCode"/> and every article on it.
    /// </summary>
    /// <param name="provinceCode">Region code; every "0000" sequence is removed before it is sent as the search key.</param>
    /// <returns>
    /// The articles that could be parsed. The list is empty when the search request is
    /// not successful or returns no entries. Articles that fail to load or parse are
    /// logged to the console and skipped.
    /// </returns>
    public List<NewsModels> HtmlCreatePageData(string provinceCode)
    {
        var list = new List<NewsModels>();
        var baseUrl = $"{SearchApiUrl}{provinceCode.Replace("0000", "")}&_t={DateTimeOffset.Now.ToUnixTimeSeconds()}";

        // The method is synchronous by contract, so the request is waited on here.
        using var response = SharedHttpClient.GetAsync(baseUrl).Result;
        if (!response.IsSuccessStatusCode)
        {
            Console.WriteLine($"Failed to fetch page {baseUrl}, Status Code: {response.StatusCode}");
            return list;
        }

        var jsonData = response.Content.ReadAsStringAsync().Result;
        var resultlist = JsonSerializer.Deserialize<GaokaoObject>(jsonData);
        if (resultlist?.msg == null)
        {
            return list;
        }

        var webClient = new HtmlWeb();
        foreach (var o in resultlist.msg)
        {
            if (string.IsNullOrEmpty(o?.uri))
            {
                continue;
            }

            // Short pause between article requests.
            Thread.Sleep(100);

            string newsUrl = $"{SiteRoot}{o.uri}";
            try
            {
                var article = LoadArticle(webClient, newsUrl);
                if (article != null)
                {
                    list.Add(article);
                }
            }
            catch (Exception ex)
            {
                // A single broken article must not discard the rest of the province.
                Console.WriteLine($"Failed to load article {newsUrl}: {ex.Message}");
            }
        }

        return list;
    }

    /// <summary>
    /// Loads one article page and extracts title, source, publication time and body.
    /// Returns null when an expected node is missing, the date cannot be parsed, or the
    /// body text contains "浏览器".
    /// </summary>
    private static NewsModels LoadArticle(HtmlWeb webClient, string newsUrl)
    {
        HtmlDocument doc = webClient.Load(newsUrl);
        var root = doc.DocumentNode;

        var titleNode = root.SelectSingleNode("//*[@id=\"app\"]/div[2]/h2");
        var authorNode = root.SelectSingleNode("//*[@id=\"app\"]/div[3]/div[1]/span[2]");
        var timeNode = root.SelectSingleNode("//*[@id=\"app\"]/div[3]/div[1]/span[1]");
        var contentNode = root.SelectSingleNode("//*[@id=\"article_dnull\"]");

        // SelectSingleNode returns null when nothing matches.
        if (titleNode == null || authorNode == null || timeNode == null || contentNode == null)
        {
            Console.WriteLine($"Skipping {newsUrl}: expected page elements not found");
            return null;
        }

        if (!DateTime.TryParse(timeNode.InnerText, out var pubtime))
        {
            Console.WriteLine($"Skipping {newsUrl}: unrecognised publish time '{timeNode.InnerText}'");
            return null;
        }

        // Rewrite root-relative links before the body markup is read.
        foreach (var linkNode in root.SelectNodes("//a[@href]") ?? new HtmlNodeCollection(null))
        {
            string href = linkNode.GetAttributeValue("href", "");
            if (IsRootRelative(href))
            {
                linkNode.SetAttributeValue("href", SiteRoot + href);
            }
        }

        var innerhtml = contentNode.InnerHtml;
        if (HtmlHelper.ReplaceHtmlTag(innerhtml).Contains("浏览器"))
        {
            return null;
        }

        if (AttachmentMarkers.Any(marker => innerhtml.Contains(marker)))
        {
            innerhtml = $"{innerhtml}\n若有附件详情,请至本省招生考试院下载附件!!!";
        }

        return new NewsModels()
        {
            title = titleNode.InnerText.Trim(),
            author = authorNode.InnerText.Replace("来源:", "").Trim(),
            pubtime = pubtime,
            // Only root-relative src values get the site root; a blanket prefix would
            // corrupt sources that are already absolute.
            detail = RootRelativeSrc.Replace(innerhtml, "src=\"" + SiteRoot + "/")
        };
    }

    /// <summary>True for "/path" values; false for empty, absolute and protocol-relative ("//host") ones.</summary>
    private static bool IsRootRelative(string url)
    {
        return !string.IsNullOrEmpty(url)
            && url.StartsWith("/", StringComparison.Ordinal)
            && !url.StartsWith("//", StringComparison.Ordinal);
    }
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Shape of the search response as it is deserialized in this file. Property names
/// are lower-case because they are matched against the JSON keys as written.
/// </summary>
public class GaokaoObject
{
    /// <summary>Entries of the response's <c>msg</c> array.</summary>
    public List<Msg> msg { get; set; }

    /// <summary>Value of the response's <c>flag</c> field.</summary>
    public bool flag { get; set; }
}
|
||||||
|
|
||||||
|
/// <summary>
/// One entry of the search response's <c>msg</c> array.
/// </summary>
public class Msg
{
    /// <summary>Value of the <c>title</c> field.</summary>
    public string title { get; set; }

    /// <summary>Value of the <c>truncTitle</c> field.</summary>
    public string truncTitle { get; set; }

    /// <summary>Value of the <c>uri</c> field; the scraper appends it to the site root to build the article URL.</summary>
    public string uri { get; set; }

    /// <summary>Value of the <c>displayDate</c> field.</summary>
    public string displayDate { get; set; }
}
|
||||||
Loading…
Reference in New Issue