using Aliyun.OSS;
using HtmlAgilityPack;
using Microsoft.AspNetCore.Mvc.RazorPages;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net.Http;
using System.Text;
using System.Threading.Tasks;
using System.Text.Json;
using New_College.Common.Helper;
using System.Threading;
using System.Text.RegularExpressions;

namespace New_College.Tasks
{
    /// <summary>
    /// Scrapes news lists (and each item's detail page) from provincial
    /// examination-authority websites and maps them to <c>NewsModels</c>.
    /// Currently handles province codes "310000" (www.shmeea.edu.cn) and
    /// "370000" (www.sdzk.cn).
    /// </summary>
    public class NationWideNewsAgHelper
    {
        private string itemUrl = "https://gaokao.chsi.com.cn";

        // One shared client instead of a new HttpClient per call, to avoid socket exhaustion.
        private static readonly HttpClient SharedHttpClient = new HttpClient();

        // Pause between detail-page requests so the remote site is not hammered.
        private const int RequestDelayMilliseconds = 200;

        // Maximum number of characters kept in a generated summary.
        private const int SummaryMaxLength = 120;

        /// <summary>
        /// Fetches the news list for the given province.
        /// </summary>
        /// <param name="provinceCode">Province code; "310000" and "370000" are supported.</param>
        /// <returns>
        /// The scraped news items. An empty list is returned for an unsupported
        /// province code or when the list page yields no items.
        /// </returns>
        public async Task<List<NewsModels>> HtmlCreatePageData(string provinceCode)
        {
            switch (provinceCode)
            {
                case "310000":
                    return await ShangHaiNewsList();
                case "370000":
                    return ShanDongNewsList();
                default:
                    return new List<NewsModels>();
            }
        }

        /// <summary>
        /// Scrapes the list page at www.shmeea.edu.cn and, for every entry, its detail page.
        /// Entries that link directly to a PDF are recorded without fetching a detail page.
        /// A failure on a single entry is logged to the console and the entry is skipped.
        /// </summary>
        /// <returns>The successfully scraped items, in list-page order.</returns>
        private async Task<List<NewsModels>> ShangHaiNewsList()
        {
            const string BaseUrl = "https://www.shmeea.edu.cn";
            var list = new List<NewsModels>();
            var listUrl = $"{BaseUrl}/page/02200/index.html";

            // Download raw bytes and decode explicitly as UTF-8 rather than
            // relying on automatic encoding detection.
            var bytes = await SharedHttpClient.GetByteArrayAsync(listUrl);
            var html = Encoding.UTF8.GetString(bytes);

            var listDoc = new HtmlDocument();
            listDoc.LoadHtml(html);

            // Fuzzy class match so extra class names on the <ul> do not break the selector.
            var items = listDoc.DocumentNode.SelectNodes("//ul[contains(@class,'pageList')]/li");
            if (items == null)
            {
                Console.WriteLine("❌ 页面列表未找到,可能被反爬或类名变动!");
                File.WriteAllText("debug.html", listDoc.DocumentNode.OuterHtml); // debugging aid
                return list;
            }

            foreach (var li in items)
            {
                try
                {
                    var aTag = li.SelectSingleNode("./a");
                    if (aTag == null)
                    {
                        continue;
                    }

                    var href = aTag.GetAttributeValue("href", "");
                    if (string.IsNullOrWhiteSpace(href))
                    {
                        continue;
                    }

                    var span = li.SelectSingleNode("./span[@class='listTime']");
                    var fullUrl = ResolveUrl(listUrl, href);
                    var title = aTag.GetAttributeValue("title", "")?.Trim();
                    var date = span?.InnerText?.Trim();

                    // Same date handling for every entry: fall back to "now" when the
                    // list page carries no parsable date.
                    var pubTime = DateTime.TryParse(date, out var parsedDate) ? parsedDate : DateTime.Now;

                    // A PDF entry has no HTML detail page to scrape.
                    if (href.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase))
                    {
                        list.Add(new NewsModels
                        {
                            title = title,
                            author = "上海市教育考试院",
                            pubtime = pubTime,
                            summary = "该条为 PDF 附件,请点击查看。",
                            // NOTE(review): link markup reconstructed around fullUrl — confirm the desired HTML.
                            detail = $"<p><a href=\"{fullUrl}\">点击下载附件</a></p>"
                        });
                        continue;
                    }

                    // Detail page: same approach, raw bytes decoded as UTF-8.
                    var detailBytes = await SharedHttpClient.GetByteArrayAsync(fullUrl);
                    var detailHtml = Encoding.UTF8.GetString(detailBytes);
                    var detailDoc = new HtmlDocument();
                    detailDoc.LoadHtml(detailHtml);

                    var model = new NewsModels
                    {
                        // Listing title is the fallback when the detail page has no title node.
                        title = title,
                        author = "上海市教育考试院",
                        pubtime = pubTime
                    };

                    // Title, with any nested <small> element removed first.
                    var titleNode = detailDoc.DocumentNode.SelectSingleNode("//div[@id='ivs_title']");
                    if (titleNode != null)
                    {
                        titleNode.SelectSingleNode("./small")?.Remove();
                        model.title = titleNode.InnerText.Trim();
                    }

                    // Body.
                    var contentNode = detailDoc.DocumentNode.SelectSingleNode("//div[@id='ivs_content']");
                    model.detail = contentNode?.InnerHtml?.Trim() ?? "";

                    // Append links to attachment-like files found in the body.
                    var links = contentNode?.SelectNodes(".//a[@href]");
                    if (links != null)
                    {
                        foreach (var link in links)
                        {
                            var hrefVal = link.GetAttributeValue("href", "");
                            if (!Regex.IsMatch(hrefVal, @"\.(pdf|mp4|docx?|xls)$", RegexOptions.IgnoreCase))
                            {
                                continue;
                            }

                            // Relative attachment links are resolved against the detail page.
                            var fullLink = ResolveUrl(fullUrl, hrefVal);
                            // NOTE(review): link markup reconstructed around fullLink — confirm the desired HTML.
                            model.detail += $"<p>附件:<a href=\"{fullLink}\">{link.InnerText}</a></p>";
                        }
                    }

                    model.summary = BuildSummary(model.detail);
                    list.Add(model);

                    // Throttle without blocking the calling thread.
                    await Task.Delay(RequestDelayMilliseconds);
                }
                catch (Exception ex)
                {
                    // Best effort: one bad entry must not abort the whole run.
                    Console.WriteLine($"❌ 抓取失败:{ex.Message}");
                }
            }

            return list;
        }

        /// <summary>
        /// Scrapes the list page at www.sdzk.cn and, for every entry, its detail page.
        /// A failure on a single entry is logged to the console and the entry is skipped.
        /// </summary>
        /// <returns>The successfully scraped items, in list-page order.</returns>
        private List<NewsModels> ShanDongNewsList()
        {
            const string ListUrl = "https://www.sdzk.cn/NewsList.aspx?BCID=20&CID=1117";
            var web = new HtmlWeb();
            var doc = web.Load(ListUrl);
            var newsList = new List<NewsModels>();

            var listItems = doc.DocumentNode.SelectNodes("//div[@class='blockLine']//ul[@class='bd']/li/a");
            if (listItems == null)
            {
                return newsList;
            }

            foreach (var item in listItems)
            {
                try
                {
                    // Basic fields.
                    var href = item.GetAttributeValue("href", "");
                    if (string.IsNullOrWhiteSpace(href))
                    {
                        continue;
                    }

                    var fullUrl = ResolveUrl(ListUrl, href);
                    var title = item.InnerText?.Trim();

                    // Publication date: a yyyy-MM-dd value directly before a closing </i>.
                    // Fall back to "now" when it is missing or unparsable instead of
                    // throwing and losing the item.
                    var dateMatch = Regex.Match(item.InnerHtml, @"(\d{4}-\d{2}-\d{2})<\/i>");
                    var pubtime = dateMatch.Success && DateTime.TryParse(dateMatch.Groups[1].Value, out var parsedDate)
                        ? parsedDate
                        : DateTime.Now;

                    // Detail page.
                    var detailDoc = web.Load(fullUrl);
                    var contentNode = detailDoc.DocumentNode.SelectSingleNode("//div[@id='UCAP-CONTENT']");
                    var detailHtml = contentNode?.InnerHtml?.Trim() ?? "";

                    // Author: first info span whose text starts with the author label.
                    string author = "未知作者";
                    var infoSpans = detailDoc.DocumentNode.SelectNodes("//div[@class='MLspan']/span");
                    if (infoSpans != null)
                    {
                        foreach (var span in infoSpans)
                        {
                            var text = span.InnerText.Trim();
                            if (text.StartsWith("作者:"))
                            {
                                author = text.Replace("作者:", "").Trim();
                                break;
                            }
                        }
                    }

                    newsList.Add(new NewsModels
                    {
                        title = title,
                        author = author,
                        summary = BuildSummary(detailHtml),
                        detail = detailHtml,
                        pubtime = pubtime
                    });

                    Thread.Sleep(RequestDelayMilliseconds); // rate limit
                }
                catch (Exception ex)
                {
                    // Best effort: one bad entry must not abort the whole run.
                    Console.WriteLine($"处理失败:{ex.Message}");
                }
            }

            return newsList;
        }

        /// <summary>
        /// Resolves <paramref name="href"/> against <paramref name="baseUrl"/>.
        /// Absolute hrefs are returned in canonical form; if resolution is not
        /// possible the href is returned unchanged.
        /// </summary>
        private static string ResolveUrl(string baseUrl, string href)
        {
            if (Uri.TryCreate(baseUrl, UriKind.Absolute, out var baseUri)
                && Uri.TryCreate(baseUri, href, out var resolved))
            {
                return resolved.AbsoluteUri;
            }

            return href;
        }

        /// <summary>
        /// Strips tag-like spans from <paramref name="html"/> and truncates the
        /// remaining text to <see cref="SummaryMaxLength"/> characters.
        /// </summary>
        private static string BuildSummary(string html)
        {
            var plainText = Regex.Replace(html, "<.*?>", "");
            return plainText.Length > SummaryMaxLength
                ? plainText.Substring(0, SummaryMaxLength)
                : plainText;
        }
    }
}

// NOTE(review): GaokaoObject and Msg are not referenced anywhere in this file;
// presumably they model a JSON response — confirm against their callers.
public class GaokaoObject
{
    // NOTE(review): element type assumed to be Msg (the type declared below) — confirm.
    public List<Msg> msg { get; set; }

    public bool flag { get; set; }
}

public class Msg
{
    public string title { get; set; }

    public string truncTitle { get; set; }

    public string uri { get; set; }

    public string displayDate { get; set; }
}