// File: NewGaoKaoApi/New_College.Tasks/HtmlAgSpider/NationWideNewsAgHelper.cs
// (243 lines, 9.4 KiB, C# as reported by the repository viewer)
//
// Note from the repository viewer: this file contains ambiguous Unicode characters
// that may be confused with others in the current locale. If their use is intentional
// and legitimate, the warning can be safely ignored.


using Aliyun.OSS;
using HtmlAgilityPack;
using Microsoft.AspNetCore.Mvc.RazorPages;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net.Http;
using System.Text;
using System.Threading.Tasks;
using System.Text.Json;
using New_College.Common.Helper;
using System.Threading;
using System.Text.RegularExpressions;
namespace New_College.Tasks
{
/// <summary>
/// Scrapes admission-news lists and article pages from provincial examination
/// authority websites and converts them into <c>NewsModels</c> entries.
/// </summary>
public class NationWideNewsAgHelper
{
    /// <summary>Maximum number of characters kept in a generated summary.</summary>
    private const int SummaryMaxLength = 120;

    /// <summary>Pause between detail-page requests, in milliseconds, to avoid being blocked.</summary>
    private const int RequestDelayMs = 200;

    // One shared client for all requests: creating an HttpClient per call can exhaust sockets.
    private static readonly HttpClient Http = new HttpClient();

    // Matches any HTML tag, including tags whose attributes span several lines.
    private static readonly Regex TagPattern = new Regex("<[^>]*>", RegexOptions.Compiled);

    // Link targets that are treated as downloadable attachments.
    private static readonly Regex AttachmentPattern =
        new Regex(@"\.(pdf|mp4|docx?|xlsx?)$", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    // Publication date as embedded in the Shandong list anchors: <i>yyyy-MM-dd</i>.
    private static readonly Regex ListDatePattern =
        new Regex(@"<i>(\d{4}-\d{2}-\d{2})</i>", RegexOptions.Compiled);

    // Not referenced by any code visible in this class.
    private string itemUrl = "https://gaokao.chsi.com.cn";

    /// <summary>
    /// Scrapes the news list for the given province.
    /// </summary>
    /// <param name="provinceCode">
    /// Province code; "310000" selects the Shanghai scraper and "370000" the Shandong scraper.
    /// </param>
    /// <returns>The scraped news items, or an empty list for any other province code.</returns>
    public async Task<List<NewsModels>> HtmlCreatePageData(string provinceCode)
    {
        switch (provinceCode)
        {
            case "310000":
                return await ShangHaiNewsList();
            case "370000":
                // The Shandong scraper is synchronous (HtmlWeb.Load), so this call blocks
                // the calling thread while it runs.
                return ShanDongNewsList();
            default:
                return new List<NewsModels>();
        }
    }

    /// <summary>
    /// Scrapes the list page of www.shmeea.edu.cn and every linked article.
    /// PDF links are recorded as download entries without fetching a detail page.
    /// </summary>
    /// <returns>The scraped items; empty when the list markup is not found.</returns>
    private async Task<List<NewsModels>> ShangHaiNewsList()
    {
        const string author = "上海市教育考试院";
        var listUri = new Uri("https://www.shmeea.edu.cn/page/02200/index.html");
        var list = new List<NewsModels>();

        // The bytes are decoded as UTF-8 explicitly instead of relying on charset detection.
        var bytes = await Http.GetByteArrayAsync(listUri);
        var listDoc = new HtmlDocument();
        listDoc.LoadHtml(Encoding.UTF8.GetString(bytes));

        // Fuzzy class match so extra class names on the <ul> do not break the lookup.
        var items = listDoc.DocumentNode.SelectNodes("//ul[contains(@class,'pageList')]/li");
        if (items == null)
        {
            Console.WriteLine("❌ 页面列表未找到,可能被反爬或类名变动!");
            DumpDebugHtml(listDoc.DocumentNode.OuterHtml);
            return list;
        }

        foreach (var li in items)
        {
            try
            {
                var aTag = li.SelectSingleNode("./a");
                var pageUri = ResolveUri(listUri, aTag?.GetAttributeValue("href", ""));
                if (pageUri == null)
                {
                    // No anchor or no usable http(s) link: nothing to scrape for this entry.
                    continue;
                }

                var href = aTag.GetAttributeValue("href", "");
                var listTitle = (aTag.GetAttributeValue("title", "") ?? "").Trim();
                if (listTitle.Length == 0)
                {
                    listTitle = (aTag.InnerText ?? "").Trim();
                }

                var dateText = li.SelectSingleNode("./span[@class='listTime']")?.InnerText?.Trim();
                var pubTime = ParsePubTime(dateText);

                // PDF entries have no HTML body; store a download link instead.
                if (href.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase))
                {
                    list.Add(new NewsModels
                    {
                        title = listTitle,
                        author = author,
                        pubtime = pubTime,
                        summary = "该条为 PDF 附件,请点击查看。",
                        detail = $"<p><a href=\"{pageUri.AbsoluteUri}\" target=\"_blank\">点击下载附件</a></p>"
                    });
                    continue;
                }

                var detailBytes = await Http.GetByteArrayAsync(pageUri);
                var detailDoc = new HtmlDocument();
                detailDoc.LoadHtml(Encoding.UTF8.GetString(detailBytes));

                var model = new NewsModels
                {
                    // Fall back to the list-page title when the detail page has no title node.
                    title = listTitle,
                    author = author,
                    pubtime = pubTime
                };

                // Title from the detail page, without the nested <small> element.
                var titleNode = detailDoc.DocumentNode.SelectSingleNode("//div[@id='ivs_title']");
                if (titleNode != null)
                {
                    titleNode.SelectSingleNode("./small")?.Remove();
                    var detailTitle = titleNode.InnerText.Trim();
                    if (detailTitle.Length > 0)
                    {
                        model.title = detailTitle;
                    }
                }

                // Article body.
                var contentNode = detailDoc.DocumentNode.SelectSingleNode("//div[@id='ivs_content']");
                model.detail = contentNode?.InnerHtml?.Trim() ?? "";

                // Append attachment links found in the body as absolute URLs.
                var links = contentNode?.SelectNodes(".//a[@href]");
                if (links != null)
                {
                    foreach (var link in links)
                    {
                        var hrefVal = link.GetAttributeValue("href", "");
                        if (!AttachmentPattern.IsMatch(hrefVal))
                        {
                            continue;
                        }

                        // Resolve against the article URL so relative links work too.
                        var attachmentUri = ResolveUri(pageUri, hrefVal);
                        if (attachmentUri == null)
                        {
                            continue;
                        }

                        model.detail += $"<p>附件:<a href=\"{attachmentUri.AbsoluteUri}\" target=\"_blank\">{link.InnerText}</a></p>";
                    }
                }

                model.summary = BuildSummary(model.detail);
                list.Add(model);

                // Throttle without blocking a thread.
                await Task.Delay(RequestDelayMs);
            }
            catch (Exception ex)
            {
                // Best effort: one failing article must not abort the whole run.
                Console.WriteLine($"❌ 抓取失败:{ex.Message}");
            }
        }

        return list;
    }

    /// <summary>
    /// Scrapes the list page of www.sdzk.cn and every linked article.
    /// </summary>
    /// <returns>The scraped items; empty when the list markup is not found.</returns>
    private List<NewsModels> ShanDongNewsList()
    {
        var listUri = new Uri("https://www.sdzk.cn/NewsList.aspx?BCID=20&CID=1117");
        var web = new HtmlWeb();
        var newsList = new List<NewsModels>();

        var doc = web.Load(listUri.AbsoluteUri);
        var listItems = doc.DocumentNode.SelectNodes("//div[@class='blockLine']//ul[@class='bd']/li/a");
        if (listItems == null)
        {
            return newsList;
        }

        foreach (var item in listItems)
        {
            try
            {
                // Resolving against the list URL avoids the double slash that plain
                // concatenation with a base ending in "/" produced.
                var pageUri = ResolveUri(listUri, item.GetAttributeValue("href", ""));
                if (pageUri == null)
                {
                    continue;
                }

                // The publication date sits inside the anchor as <i>yyyy-MM-dd</i>.
                var dateMatch = ListDatePattern.Match(item.InnerHtml);
                var dateText = dateMatch.Success ? dateMatch.Groups[1].Value : null;
                var title = ExtractTitleWithoutDate(item, dateText);

                // Load the detail page.
                var detailDoc = web.Load(pageUri.AbsoluteUri);
                var contentNode = detailDoc.DocumentNode.SelectSingleNode("//div[@id='UCAP-CONTENT']");
                var detailHtml = contentNode?.InnerHtml?.Trim() ?? "";

                // Author: first info span that starts with the "作者:" label.
                var author = "未知作者";
                var infoSpans = detailDoc.DocumentNode.SelectNodes("//div[@class='MLspan']/span");
                if (infoSpans != null)
                {
                    foreach (var span in infoSpans)
                    {
                        var text = span.InnerText.Trim();
                        if (text.StartsWith("作者:", StringComparison.Ordinal))
                        {
                            author = text.Replace("作者:", "").Trim();
                            break;
                        }
                    }
                }

                // NOTE(review): a cover-image URL used to be computed here but was never
                // stored on the model, so that dead code was removed.

                newsList.Add(new NewsModels
                {
                    title = title,
                    author = author,
                    summary = BuildSummary(detailHtml),
                    detail = detailHtml,
                    // A missing date falls back to the current time instead of throwing
                    // and dropping an article that was already downloaded.
                    pubtime = ParsePubTime(dateText)
                });

                Thread.Sleep(RequestDelayMs); // throttle; this method is synchronous
            }
            catch (Exception ex)
            {
                // Best effort: one failing article must not abort the whole run.
                Console.WriteLine($"处理失败:{ex.Message}");
            }
        }

        return newsList;
    }

    /// <summary>Resolves <paramref name="href"/> against the page it was found on.</summary>
    /// <returns>An absolute http/https URI, or null when the link is empty, malformed or uses another scheme.</returns>
    private static Uri ResolveUri(Uri pageUri, string href)
    {
        if (string.IsNullOrWhiteSpace(href))
        {
            return null;
        }

        if (!Uri.TryCreate(pageUri, href.Trim(), out var resolved))
        {
            return null;
        }

        var isWebLink = resolved.Scheme == Uri.UriSchemeHttp || resolved.Scheme == Uri.UriSchemeHttps;
        return isWebLink ? resolved : null;
    }

    /// <summary>Parses a publication date, falling back to the current local time when it cannot be parsed.</summary>
    private static DateTime ParsePubTime(string text)
    {
        return DateTime.TryParse(text, out var parsed) ? parsed : DateTime.Now;
    }

    /// <summary>Strips HTML tags and keeps at most <see cref="SummaryMaxLength"/> characters.</summary>
    private static string BuildSummary(string html)
    {
        var plainText = TagPattern.Replace(html ?? "", "");
        return plainText.Length > SummaryMaxLength ? plainText.Substring(0, SummaryMaxLength) : plainText;
    }

    /// <summary>
    /// Returns the anchor text without the embedded date element, so the date
    /// does not end up appended to the title.
    /// </summary>
    private static string ExtractTitleWithoutDate(HtmlNode anchor, string dateText)
    {
        if (dateText != null)
        {
            // Materialize first: nodes must not be removed while enumerating descendants.
            var dateNodes = anchor.Descendants("i")
                .Where(node => node.InnerText.Trim() == dateText)
                .ToList();
            foreach (var dateNode in dateNodes)
            {
                dateNode.Remove();
            }
        }

        return anchor.InnerText?.Trim();
    }

    /// <summary>Writes the list-page HTML to debug.html for troubleshooting; failures are only logged.</summary>
    private static void DumpDebugHtml(string html)
    {
        try
        {
            File.WriteAllText("debug.html", html);
        }
        catch (Exception ex) when (ex is IOException || ex is UnauthorizedAccessException)
        {
            // The dump is a debugging aid only; it must not fail the scrape.
            Console.WriteLine($"debug.html 写入失败:{ex.Message}");
        }
    }
}
}
/// <summary>
/// Plain data holder pairing a list of <see cref="Msg"/> entries with a boolean flag.
/// NOTE(review): presumably mirrors a JSON response envelope; no code in this file uses it, so confirm against the caller.
/// </summary>
public class GaokaoObject
{
    /// <summary>Message entries; null until assigned.</summary>
    public List<Msg> msg { get; set; }

    /// <summary>Boolean flag carried alongside the messages; its meaning is not shown in this file.</summary>
    public bool flag { get; set; }
}
/// <summary>
/// Plain data holder with four string properties describing one message entry.
/// NOTE(review): presumably one item of a remote news listing; confirm against the caller.
/// </summary>
public class Msg
{
    /// <summary>Title text.</summary>
    public string title { get; set; }

    /// <summary>Second title field; presumably a shortened form of the title, judging by the name only.</summary>
    public string truncTitle { get; set; }

    /// <summary>URI string of the entry.</summary>
    public string uri { get; set; }

    /// <summary>Date kept as a string; its format is not shown in this file.</summary>
    public string displayDate { get; set; }
}