From f7d6c89b7a5405e2f63dc280a550d2208ddc22ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?old=E6=98=93?= <156663459@qq.com> Date: Fri, 30 May 2025 15:31:30 +0800 Subject: [PATCH] bug fixed --- .../HostedService/JobTimedSpiderService.cs | 8 +- .../HtmlAgSpider/HtmlAgHelper.cs | 1 + .../HtmlAgSpider/NationWideNewsAgHelper.cs | 251 +++++++++++++----- 3 files changed, 196 insertions(+), 64 deletions(-) diff --git a/New_College.Tasks/HostedService/JobTimedSpiderService.cs b/New_College.Tasks/HostedService/JobTimedSpiderService.cs index e835ec9..23cfc00 100644 --- a/New_College.Tasks/HostedService/JobTimedSpiderService.cs +++ b/New_College.Tasks/HostedService/JobTimedSpiderService.cs @@ -36,10 +36,10 @@ namespace New_College.Tasks private async void DoWork(object state) { - // BatchListAgHelper batchListAgHelper = new BatchListAgHelper(); + // BatchListAgHelper batchListAgHelper = new BatchListAgHelper(); NationWideNewsAgHelper anghelper = new NationWideNewsAgHelper(); var provineInfo = await this.sysRegionServices.Query(e => e.Level == 1 && !e.RegionCode.Contains("-")); - provineInfo.ForEach(p => + provineInfo.ForEach(async p => { //var years = new List() { "2024", "2023", "2022" }; //years.ForEach(y => @@ -48,7 +48,7 @@ namespace New_College.Tasks //}); try { - var list = anghelper.HtmlCreatePageData(p.RegionCode); + var list = await anghelper.HtmlCreatePageData(p.RegionCode); list.ForEach(async c => { c.title = c.title.Replace($"{p.SimpleName}:", ""); @@ -68,7 +68,7 @@ namespace New_College.Tasks OrderSort = 0, IsDelete = false, Title = c.title, - Summary = HtmlHelper.ReplaceHtmlTag(c.detail).Length > 200 ? HtmlHelper.ReplaceHtmlTag(c.detail).Substring(0, 200) : HtmlHelper.ReplaceHtmlTag(c.detail) + Summary = c.summary }); } diff --git a/New_College.Tasks/HtmlAgSpider/HtmlAgHelper.cs b/New_College.Tasks/HtmlAgSpider/HtmlAgHelper.cs index 6db2dbd..7fc82f2 100644 --- a/New_College.Tasks/HtmlAgSpider/HtmlAgHelper.cs +++ b/New_College.Tasks/HtmlAgSpider/HtmlAgHelper.cs @@ -54,6 +54,7 @@ namespace New_College.Tasks { public string title { get; set; } public string author { get; set; } + public string summary { get; set; } public DateTime pubtime { get; set; } public string detail { get; set; } } diff --git a/New_College.Tasks/HtmlAgSpider/NationWideNewsAgHelper.cs b/New_College.Tasks/HtmlAgSpider/NationWideNewsAgHelper.cs index 5e15b64..5018acb 100644 --- a/New_College.Tasks/HtmlAgSpider/NationWideNewsAgHelper.cs +++ b/New_College.Tasks/HtmlAgSpider/NationWideNewsAgHelper.cs @@ -18,81 +18,212 @@ namespace New_College.Tasks { private string itemUrl = "https://gaokao.chsi.com.cn"; - public List HtmlCreatePageData(string provinceCode) + public async Task> HtmlCreatePageData(string provinceCode) { - - HtmlWeb webClient = new HtmlWeb(); - - var apiUrl = "https://gaokao.chsi.com.cn/wap/news/search/5018267?ps=20&ss="; var list = new List(); - var baseUrl = $"{apiUrl}{provinceCode.Replace("0000", "")}&_t={DateTimeOffset.Now.ToUnixTimeSeconds()}"; - using var httpClient = new HttpClient(); - var response = httpClient.GetAsync(baseUrl).Result; - if (response.IsSuccessStatusCode) + + switch (provinceCode) { - var jsonData = response.Content.ReadAsStringAsync().Result; - Console.WriteLine(jsonData); - var resultlist = JsonSerializer.Deserialize(jsonData); - if (resultlist.msg.Any()) + case "310000": + list=await ShangHaiNewsList(); + break; + case "370000": + list= ShanDongNewsList(); + break; + + } + return list; + } + + + private async Task> ShangHaiNewsList() + { + string BaseUrl = "https://www.shmeea.edu.cn"; + // var web = new HtmlWeb(); + var list = new List(); + var listUrl = $"{BaseUrl}/page/02200/index.html"; + using var client = new HttpClient(); + var bytes = await client.GetByteArrayAsync(listUrl); + var html = Encoding.UTF8.GetString(bytes); // ✅ 关键:手动用 UTF-8 解码 + + // ✅ 正确方式:用 HtmlDocument.LoadHtml 解析字符串 + var listDoc = new HtmlDocument(); + listDoc.LoadHtml(html); + + // 核心修改:class 名模糊匹配 + 节点层级简化 + var items = listDoc.DocumentNode.SelectNodes("//ul[contains(@class,'pageList')]/li"); + if (items == null) + { + Console.WriteLine("❌ 页面列表未找到,可能被反爬或类名变动!"); + File.WriteAllText("debug.html", listDoc.DocumentNode.OuterHtml); // 调试用 + return list; + } + foreach (var li in items) + { + try { - try + var aTag = li.SelectSingleNode("./a"); + var span = li.SelectSingleNode("./span[@class='listTime']"); + + var href = aTag.GetAttributeValue("href", ""); + var fullUrl = href.StartsWith("http") ? href : $"{BaseUrl}{href}"; + var title = aTag.GetAttributeValue("title", "")?.Trim(); + var date = span?.InnerText?.Trim(); + + // PDF 直接跳过正文抓取 + if (href.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase)) { - resultlist.msg.ForEach(o => + list.Add(new NewsModels { - Thread.Sleep(100); - string newsUrl = $"{itemUrl}{o.uri}"; - HtmlDocument doc = webClient.Load(newsUrl); - var inntertitle = doc.DocumentNode.SelectSingleNode("//*[@id=\"app\"]/div[2]/h2").InnerText; - var author = doc.DocumentNode.SelectSingleNode("//*[@id=\"app\"]/div[3]/div[1]/span[2]").InnerText.Replace("来源:", ""); - var createtime = doc.DocumentNode.SelectSingleNode("//*[@id=\"app\"]/div[3]/div[1]/span[1]").InnerText; - foreach (var linkNode in doc.DocumentNode.SelectNodes("//a[@href]") ?? new HtmlNodeCollection(null)) - { - string href = linkNode.GetAttributeValue("href", ""); - if (!string.IsNullOrEmpty(href) && href.StartsWith("/")) - { - string fullUrl = "https://gaokao.chsi.com.cn" + href; - linkNode.SetAttributeValue("href", fullUrl); - } - } - var innerhtml = doc.DocumentNode.SelectSingleNode("//*[@id=\"article_dnull\"]").InnerHtml; - //if (!HtmlHelper.ReplaceHtmlTag(innerhtml).Contains("浏览器")) - //{ - - if (innerhtml.Contains(".pdf") || innerhtml.Contains(".doc") || innerhtml.Contains(".docx") || innerhtml.Contains(".xls") || innerhtml.Contains(".xlsx")) - { - innerhtml = $"{innerhtml}\n若有附件详情,请至本省招生考试院下载附件!!!"; - } - - list.Add(new NewsModels() - { - title = inntertitle, - author = author, - pubtime = Convert.ToDateTime(createtime), - detail = Regex.Replace(innerhtml, @"src=""(?!https?:\/\/)(.*?)""", @"src=""https://gaokao.chsi.com.cn$1""") - }); - // } + title = title, + author = "上海市教育考试院", + pubtime = Convert.ToDateTime(date), + summary = "该条为 PDF 附件,请点击查看。", + detail = $"

点击下载附件

" }); + continue; } - catch (Exception ex) + + // ✅ 正确方式:用 HtmlDocument.LoadHtml 解析字符串 + + // 加载详情页:同样用 HttpClient + UTF-8 解码 + var detailBytes = await client.GetByteArrayAsync(fullUrl); + var detailHtml = Encoding.UTF8.GetString(detailBytes); + var detailDoc = new HtmlDocument(); + detailDoc.LoadHtml(detailHtml); + + var model = new NewsModels { - Console.WriteLine(ex.Message); + author = "上海市教育考试院", + pubtime = DateTime.TryParse(date, out var pubTime) ? pubTime : DateTime.Now + }; + + + // 标题(剔除 small) + var titleNode = detailDoc.DocumentNode.SelectSingleNode("//div[@id='ivs_title']"); + if (titleNode != null) + { + var small = titleNode.SelectSingleNode("./small"); + if (small != null) small.Remove(); + model.title = titleNode.InnerText.Trim(); } + + // 正文 + var contentNode = detailDoc.DocumentNode.SelectSingleNode("//div[@id='ivs_content']"); + model.detail = contentNode?.InnerHtml?.Trim() ?? ""; + + + + // 附件追加到正文 + var links = contentNode?.SelectNodes(".//a[@href]"); + if (links != null) + { + foreach (var link in links) + { + var hrefVal = link.GetAttributeValue("href", ""); + if (Regex.IsMatch(hrefVal, @"\.(pdf|mp4|docx?|xls)$", RegexOptions.IgnoreCase)) + { + var fullLink = hrefVal.StartsWith("http") ? hrefVal : $"{BaseUrl}{hrefVal}"; + model.detail += $"

附件:{link.InnerText}

"; + } + } + } + + // 摘要 + var plainText = Regex.Replace(model.detail, "<.*?>", ""); + model.summary = plainText.Length > 120 ? plainText.Substring(0, 120) : plainText; + list.Add(model); + + Thread.Sleep(200); // 防止请求过快被封 + } + catch (Exception ex) + { + Console.WriteLine($"❌ 抓取失败:{ex.Message}"); } } - else - { - Console.WriteLine($"Failed to fetch page {baseUrl}, Status Code: {response.StatusCode}"); - } - - - - - - - return list; } + + private List ShanDongNewsList() + { + string BaseUrl = "https://www.sdzk.cn/"; + + var web = new HtmlWeb(); + var listUrl = "https://www.sdzk.cn/NewsList.aspx?BCID=20&CID=1117"; + var doc = web.Load(listUrl); + var newsList = new List(); + var listItems = doc.DocumentNode.SelectNodes("//div[@class='blockLine']//ul[@class='bd']/li/a"); + if (listItems == null) return newsList; + foreach (var item in listItems) + { + try + { + // 基础字段 + var href = item.GetAttributeValue("href", ""); + var fullUrl = href.StartsWith("http") ? href : $"{BaseUrl}/{href.TrimStart('/')}"; + var title = item.InnerText?.Trim(); + + // 发布时间解析 + var dateMatch = Regex.Match(item.InnerHtml, @"(\d{4}-\d{2}-\d{2})<\/i>"); + var pubtime = dateMatch.Success ? dateMatch.Groups[1].Value : "未知时间"; + + // 加载详情页 + var detailDoc = web.Load(fullUrl); + var contentNode = detailDoc.DocumentNode.SelectSingleNode("//div[@id='UCAP-CONTENT']"); + var detailHtml = contentNode?.InnerHtml?.Trim() ?? ""; + + // 提取作者 + var infoSpans = detailDoc.DocumentNode.SelectNodes("//div[@class='MLspan']/span"); + string author = "未知作者"; + if (infoSpans != null) + { + foreach (var span in infoSpans) + { + var text = span.InnerText.Trim(); + if (text.StartsWith("作者:")) + { + author = text.Replace("作者:", "").Trim(); + break; + } + } + } + // 封面图 + string coverImg = ""; + var firstImg = contentNode?.SelectSingleNode(".//img"); + if (firstImg != null) + { + var src = firstImg.GetAttributeValue("src", ""); + if (!string.IsNullOrEmpty(src)) + { + coverImg = src.StartsWith("http") ? src : $"{BaseUrl}{src}"; + } + } + // 摘要 + var plainText = Regex.Replace(detailHtml, "<.*?>", ""); + var summary = plainText.Length > 120 ? plainText.Substring(0, 120) : plainText; + + newsList.Add(new NewsModels + { + title = title, + author = author, + summary = summary, + detail = detailHtml, + pubtime = DateTime.Parse(pubtime) + }); + + Thread.Sleep(200); // 限速 + } + catch (Exception ex) + { + Console.WriteLine($"处理失败:{ex.Message}"); + } + } + + return newsList; + + } + } }