bug fixed

develop
old易 2025-05-30 15:31:30 +08:00
parent 86eb0ff747
commit f7d6c89b7a
3 changed files with 196 additions and 64 deletions

View File

@ -36,10 +36,10 @@ namespace New_College.Tasks
private async void DoWork(object state) private async void DoWork(object state)
{ {
// BatchListAgHelper batchListAgHelper = new BatchListAgHelper(); // BatchListAgHelper batchListAgHelper = new BatchListAgHelper();
NationWideNewsAgHelper anghelper = new NationWideNewsAgHelper(); NationWideNewsAgHelper anghelper = new NationWideNewsAgHelper();
var provineInfo = await this.sysRegionServices.Query(e => e.Level == 1 && !e.RegionCode.Contains("-")); var provineInfo = await this.sysRegionServices.Query(e => e.Level == 1 && !e.RegionCode.Contains("-"));
provineInfo.ForEach(p => provineInfo.ForEach(async p =>
{ {
//var years = new List<string>() { "2024", "2023", "2022" }; //var years = new List<string>() { "2024", "2023", "2022" };
//years.ForEach(y => //years.ForEach(y =>
@ -48,7 +48,7 @@ namespace New_College.Tasks
//}); //});
try try
{ {
var list = anghelper.HtmlCreatePageData(p.RegionCode); var list = await anghelper.HtmlCreatePageData(p.RegionCode);
list.ForEach(async c => list.ForEach(async c =>
{ {
c.title = c.title.Replace($"{p.SimpleName}", ""); c.title = c.title.Replace($"{p.SimpleName}", "");
@ -68,7 +68,7 @@ namespace New_College.Tasks
OrderSort = 0, OrderSort = 0,
IsDelete = false, IsDelete = false,
Title = c.title, Title = c.title,
Summary = HtmlHelper.ReplaceHtmlTag(c.detail).Length > 200 ? HtmlHelper.ReplaceHtmlTag(c.detail).Substring(0, 200) : HtmlHelper.ReplaceHtmlTag(c.detail) Summary = c.summary
}); });
} }

View File

@ -54,6 +54,7 @@ namespace New_College.Tasks
{ {
public string title { get; set; } public string title { get; set; }
public string author { get; set; } public string author { get; set; }
public string summary { get; set; }
public DateTime pubtime { get; set; } public DateTime pubtime { get; set; }
public string detail { get; set; } public string detail { get; set; }
} }

View File

@ -18,81 +18,212 @@ namespace New_College.Tasks
{ {
private string itemUrl = "https://gaokao.chsi.com.cn"; private string itemUrl = "https://gaokao.chsi.com.cn";
// NOTE(review): this span is a side-by-side diff rendering. Where a line holds two statements, the first is the
// previous (synchronous) implementation and the second is the new (async) implementation; lines with a single
// statement belong to whichever column had content at that row.
/// <summary>
/// New version: selects a province-specific scraper from <paramref name="provinceCode"/>.
/// "310000" awaits ShangHaiNewsList, "370000" calls ShanDongNewsList; any other code returns the empty list.
/// </summary>
/// <param name="provinceCode">Region code compared against the literal case labels below.</param>
/// <returns>The list produced by the selected scraper, or an empty list when no case matches.</returns>
public List<NewsModels> HtmlCreatePageData(string provinceCode) public async Task<List<NewsModels>> HtmlCreatePageData(string provinceCode)
{ {
HtmlWeb webClient = new HtmlWeb();
var apiUrl = "https://gaokao.chsi.com.cn/wap/news/search/5018267?ps=20&ss=";
var list = new List<NewsModels>(); var list = new List<NewsModels>();
var baseUrl = $"{apiUrl}{provinceCode.Replace("0000", "")}&_t={DateTimeOffset.Now.ToUnixTimeSeconds()}";
// Previous version blocked on the HTTP call with .Result; the new version dispatches with a switch instead.
using var httpClient = new HttpClient(); switch (provinceCode)
var response = httpClient.GetAsync(baseUrl).Result;
if (response.IsSuccessStatusCode)
{ {
var jsonData = response.Content.ReadAsStringAsync().Result; case "310000":
Console.WriteLine(jsonData); list=await ShangHaiNewsList();
var resultlist = JsonSerializer.Deserialize<GaokaoObject>(jsonData); break;
if (resultlist.msg.Any()) case "370000":
list= ShanDongNewsList();
break;
}
return list;
}
// NOTE(review): this span is a side-by-side diff rendering. Several rows below carry a fragment of the previous
// HtmlCreatePageData body (left column) fused in front of, or standing in place of, the new statement; read
// such rows as two separate columns rather than as one statement.
/// <summary>
/// New version: downloads the list page at {BaseUrl}/page/02200/index.html, and for every list item either
/// records a stub entry (links ending in ".pdf") or downloads the detail page and extracts title, body,
/// attachment links and a summary of at most 120 characters.
/// </summary>
/// <returns>The collected entries; the empty list when the list nodes are not found.</returns>
private async Task<List<NewsModels>> ShangHaiNewsList()
{
string BaseUrl = "https://www.shmeea.edu.cn";
// var web = new HtmlWeb();
var list = new List<NewsModels>();
var listUrl = $"{BaseUrl}/page/02200/index.html";
// NOTE(review): a new HttpClient is constructed on every call of this method.
using var client = new HttpClient();
var bytes = await client.GetByteArrayAsync(listUrl);
var html = Encoding.UTF8.GetString(bytes); // Key point: decode manually as UTF-8
// Parse the decoded string with HtmlDocument.LoadHtml
var listDoc = new HtmlDocument();
listDoc.LoadHtml(html);
// Core change: fuzzy match on the class name + simplified node hierarchy
var items = listDoc.DocumentNode.SelectNodes("//ul[contains(@class,'pageList')]/li");
if (items == null)
{
Console.WriteLine("❌ 页面列表未找到,可能被反爬或类名变动!");
File.WriteAllText("debug.html", listDoc.DocumentNode.OuterHtml); // for debugging
return list;
}
foreach (var li in items)
{
try
{ {
// NOTE(review): aTag is dereferenced below without a null check; an <li> without a direct <a> child would
// throw and be dropped by the catch at the end of the loop body.
try var aTag = li.SelectSingleNode("./a");
var span = li.SelectSingleNode("./span[@class='listTime']");
var href = aTag.GetAttributeValue("href", "");
var fullUrl = href.StartsWith("http") ? href : $"{BaseUrl}{href}";
var title = aTag.GetAttributeValue("title", "")?.Trim();
var date = span?.InnerText?.Trim();
// PDFs: skip fetching the article body
if (href.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase))
{ {
// New column of the following rows: list.Add(new NewsModels { title, author, pubtime, summary, detail }).
// NOTE(review): this branch uses Convert.ToDateTime(date), which throws on unparseable text, while the
// non-PDF path further down uses DateTime.TryParse with a DateTime.Now fallback.
resultlist.msg.ForEach(o => list.Add(new NewsModels
{ {
Thread.Sleep(100); title = title,
string newsUrl = $"{itemUrl}{o.uri}"; author = "上海市教育考试院",
HtmlDocument doc = webClient.Load(newsUrl); pubtime = Convert.ToDateTime(date),
var inntertitle = doc.DocumentNode.SelectSingleNode("//*[@id=\"app\"]/div[2]/h2").InnerText; summary = "该条为 PDF 附件,请点击查看。",
var author = doc.DocumentNode.SelectSingleNode("//*[@id=\"app\"]/div[3]/div[1]/span[2]").InnerText.Replace("来源:", ""); detail = $"<p><a href=\"{fullUrl}\" target=\"_blank\">点击下载附件</a></p>"
var createtime = doc.DocumentNode.SelectSingleNode("//*[@id=\"app\"]/div[3]/div[1]/span[1]").InnerText;
foreach (var linkNode in doc.DocumentNode.SelectNodes("//a[@href]") ?? new HtmlNodeCollection(null))
{
string href = linkNode.GetAttributeValue("href", "");
if (!string.IsNullOrEmpty(href) && href.StartsWith("/"))
{
string fullUrl = "https://gaokao.chsi.com.cn" + href;
linkNode.SetAttributeValue("href", fullUrl);
}
}
var innerhtml = doc.DocumentNode.SelectSingleNode("//*[@id=\"article_dnull\"]").InnerHtml;
//if (!HtmlHelper.ReplaceHtmlTag(innerhtml).Contains("浏览器"))
//{
if (innerhtml.Contains(".pdf") || innerhtml.Contains(".doc") || innerhtml.Contains(".docx") || innerhtml.Contains(".xls") || innerhtml.Contains(".xlsx"))
{
innerhtml = $"{innerhtml}\n若有附件详情请至本省招生考试院下载附件!!!";
}
list.Add(new NewsModels()
{
title = inntertitle,
author = author,
pubtime = Convert.ToDateTime(createtime),
detail = Regex.Replace(innerhtml, @"src=""(?!https?:\/\/)(.*?)""", @"src=""https://gaokao.chsi.com.cn$1""")
});
// }
}); });
continue;
} }
catch (Exception ex)
// Parse the string with HtmlDocument.LoadHtml
// Load the detail page: again HttpClient + UTF-8 decoding
var detailBytes = await client.GetByteArrayAsync(fullUrl);
var detailHtml = Encoding.UTF8.GetString(detailBytes);
var detailDoc = new HtmlDocument();
detailDoc.LoadHtml(detailHtml);
var model = new NewsModels
{ {
Console.WriteLine(ex.Message); author = "上海市教育考试院",
pubtime = DateTime.TryParse(date, out var pubTime) ? pubTime : DateTime.Now
};
// Title (with the <small> element removed)
var titleNode = detailDoc.DocumentNode.SelectSingleNode("//div[@id='ivs_title']");
if (titleNode != null)
{
var small = titleNode.SelectSingleNode("./small");
if (small != null) small.Remove();
model.title = titleNode.InnerText.Trim();
} }
// Body
var contentNode = detailDoc.DocumentNode.SelectSingleNode("//div[@id='ivs_content']");
model.detail = contentNode?.InnerHtml?.Trim() ?? "";
// Append attachments to the body
var links = contentNode?.SelectNodes(".//a[@href]");
if (links != null)
{
foreach (var link in links)
{
var hrefVal = link.GetAttributeValue("href", "");
// NOTE(review): the pattern accepts .doc/.docx but only .xls, so links ending in ".xlsx" are not appended.
if (Regex.IsMatch(hrefVal, @"\.(pdf|mp4|docx?|xls)$", RegexOptions.IgnoreCase))
{
var fullLink = hrefVal.StartsWith("http") ? hrefVal : $"{BaseUrl}{hrefVal}";
model.detail += $"<p>附件:<a href=\"{fullLink}\" target=\"_blank\">{link.InnerText}</a></p>";
}
}
}
// Summary: tags stripped with a regex, then cut to at most 120 characters
var plainText = Regex.Replace(model.detail, "<.*?>", "");
model.summary = plainText.Length > 120 ? plainText.Substring(0, 120) : plainText;
list.Add(model);
// NOTE(review): Thread.Sleep blocks the calling thread inside an async method.
Thread.Sleep(200); // avoid being blocked for requesting too fast
}
catch (Exception ex)
{
Console.WriteLine($"❌ 抓取失败:{ex.Message}");
} }
} }
else
{
Console.WriteLine($"Failed to fetch page {baseUrl}, Status Code: {response.StatusCode}");
}
return list; return list;
} }
/// <summary>
/// Scrapes the sdzk.cn news list page and the detail page behind every list entry.
/// </summary>
/// <returns>
/// One <see cref="NewsModels"/> per entry whose detail page could be processed; an empty list when the
/// list page contains no matching anchors. A failure on a single entry is logged and that entry is skipped.
/// </returns>
private List<NewsModels> ShanDongNewsList()
{
    // No trailing slash: relative links are joined with an explicit "/" below, and a trailing slash here
    // used to produce URLs of the form "https://www.sdzk.cn//...".
    const string BaseUrl = "https://www.sdzk.cn";
    const string ListUrl = "https://www.sdzk.cn/NewsList.aspx?BCID=20&CID=1117";
    const int SummaryMaxLength = 120;

    var web = new HtmlWeb();
    var newsList = new List<NewsModels>();
    var doc = web.Load(ListUrl);
    var listItems = doc.DocumentNode.SelectNodes("//div[@class='blockLine']//ul[@class='bd']/li/a");
    if (listItems == null)
    {
        return newsList;
    }

    foreach (var item in listItems)
    {
        try
        {
            // Basic fields. An anchor without an href has no detail page; skip it instead of fetching the site root.
            var href = item.GetAttributeValue("href", "");
            if (string.IsNullOrWhiteSpace(href))
            {
                continue;
            }

            var fullUrl = href.StartsWith("http", StringComparison.OrdinalIgnoreCase)
                ? href
                : $"{BaseUrl}/{href.TrimStart('/')}";

            // NOTE(review): InnerText also contains the text of any nested <i> element, so when the date below
            // is found inside the anchor the title presumably includes that date — confirm whether intended.
            var title = item.InnerText?.Trim();

            // Publication date: taken from an <i>yyyy-MM-dd</i> element inside the anchor. When it is missing or
            // unparseable, fall back to the current time (same fallback ShangHaiNewsList uses) instead of
            // throwing and losing the entry.
            var dateMatch = Regex.Match(item.InnerHtml, @"<i>(\d{4}-\d{2}-\d{2})<\/i>");
            var pubtime = dateMatch.Success && DateTime.TryParse(dateMatch.Groups[1].Value, out var parsedDate)
                ? parsedDate
                : DateTime.Now;

            // Detail page
            var detailDoc = web.Load(fullUrl);
            var contentNode = detailDoc.DocumentNode.SelectSingleNode("//div[@id='UCAP-CONTENT']");
            var detailHtml = contentNode?.InnerHtml?.Trim() ?? "";

            // Author: first info span whose text starts with the "作者:" label; placeholder otherwise.
            string author = "未知作者";
            var infoSpans = detailDoc.DocumentNode.SelectNodes("//div[@class='MLspan']/span");
            if (infoSpans != null)
            {
                foreach (var span in infoSpans)
                {
                    var text = span.InnerText.Trim();
                    if (text.StartsWith("作者:", StringComparison.Ordinal))
                    {
                        author = text.Replace("作者:", "").Trim();
                        break;
                    }
                }
            }

            // Summary: body with tags stripped, cut to SummaryMaxLength characters.
            var plainText = Regex.Replace(detailHtml, "<.*?>", "");
            var summary = plainText.Length > SummaryMaxLength
                ? plainText.Substring(0, SummaryMaxLength)
                : plainText;

            newsList.Add(new NewsModels
            {
                title = title,
                author = author,
                summary = summary,
                detail = detailHtml,
                pubtime = pubtime
            });

            Thread.Sleep(200); // rate limit between detail-page requests
        }
        catch (Exception ex)
        {
            // Best effort: one broken entry must not abort the whole list.
            Console.WriteLine($"处理失败:{ex.Message}");
        }
    }

    return newsList;
}
} }
} }