using Aliyun.OSS;
using HtmlAgilityPack;
using Microsoft.AspNetCore.Mvc.RazorPages;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net.Http;
using System.Text;
using System.Threading.Tasks;
using System.Text.Json;
using New_College.Common.Helper;
using System.Threading;
using System.Text.RegularExpressions;
namespace New_College.Tasks
{
    /// <summary>
    /// Scrapes news lists from provincial education-examination websites and
    /// converts every article into a <c>NewsModels</c> entry.
    /// </summary>
    public class NationWideNewsAgHelper
    {
        /// <summary>Pause after each fetched detail page, in milliseconds, so the target site does not block us for requesting too fast.</summary>
        private const int RequestDelayMilliseconds = 200;

        /// <summary>Maximum number of characters kept in a generated summary.</summary>
        private const int SummaryMaxLength = 120;

        /// <summary>Matches links whose target is a downloadable attachment (pdf, mp4, doc, docx, xls).</summary>
        private static readonly Regex AttachmentLinkRegex = new Regex(@"\.(pdf|mp4|docx?|xls)$", RegexOptions.IgnoreCase);

        /// <summary>Matches a single HTML tag; used to reduce markup to plain text for summaries.</summary>
        private static readonly Regex HtmlTagRegex = new Regex("<.*?>");

        /// <summary>Captures a yyyy-MM-dd date that is immediately followed by a closing <c>i</c> tag.</summary>
        private static readonly Regex ListDateRegex = new Regex(@"(\d{4}-\d{2}-\d{2})<\/i>");

        /// <summary>Shared client: reusing one instance avoids socket churn from creating a client per call.</summary>
        private static readonly HttpClient Http = new HttpClient();

        // Not referenced by any member of this class; kept unchanged.
        private string itemUrl = "https://gaokao.chsi.com.cn";

        /// <summary>
        /// Fetches the news list for the given province.
        /// </summary>
        /// <param name="provinceCode">
        /// Province code; "310000" selects the Shanghai scraper and "370000" the Shandong scraper.
        /// </param>
        /// <returns>The scraped entries, or an empty list for any other code.</returns>
        public async Task<List<NewsModels>> HtmlCreatePageData(string provinceCode)
        {
            switch (provinceCode)
            {
                case "310000":
                    return await ShangHaiNewsList();
                case "370000":
                    // The Shandong scraper is synchronous (HtmlWeb.Load), so this call blocks until it finishes.
                    return ShanDongNewsList();
                default:
                    return new List<NewsModels>();
            }
        }

        /// <summary>
        /// Scrapes the Shanghai list page and, for each entry, its detail page.
        /// PDF entries are recorded as a download link without fetching a detail page.
        /// </summary>
        /// <returns>
        /// One entry per successfully processed list item; an empty list when the list
        /// markup is not found. A failing item is logged and skipped.
        /// </returns>
        private async Task<List<NewsModels>> ShangHaiNewsList()
        {
            const string BaseUrl = "https://www.shmeea.edu.cn";
            const string Author = "上海市教育考试院";

            var list = new List<NewsModels>();
            var listUrl = $"{BaseUrl}/page/02200/index.html";

            // Decode the bytes explicitly as UTF-8 and parse the resulting string.
            var bytes = await Http.GetByteArrayAsync(listUrl);
            var listDoc = new HtmlDocument();
            listDoc.LoadHtml(Encoding.UTF8.GetString(bytes));

            // Fuzzy class match so that extra class names on the <ul> do not break the selector.
            var items = listDoc.DocumentNode.SelectNodes("//ul[contains(@class,'pageList')]/li");
            if (items == null)
            {
                Console.WriteLine("❌ 页面列表未找到,可能被反爬或类名变动!");
                File.WriteAllText("debug.html", listDoc.DocumentNode.OuterHtml); // dump the page for debugging
                return list;
            }

            foreach (var li in items)
            {
                try
                {
                    var aTag = li.SelectSingleNode("./a");
                    if (aTag == null)
                    {
                        // No link in this <li>; nothing to scrape.
                        continue;
                    }

                    var href = aTag.GetAttributeValue("href", "");
                    if (string.IsNullOrWhiteSpace(href))
                    {
                        // An empty href would resolve to the site root and be stored as an article.
                        continue;
                    }

                    var fullUrl = ToAbsoluteUrl(BaseUrl, href);
                    var listTitle = aTag.GetAttributeValue("title", "")?.Trim();
                    if (string.IsNullOrEmpty(listTitle))
                    {
                        listTitle = aTag.InnerText.Trim();
                    }

                    var dateText = li.SelectSingleNode("./span[@class='listTime']")?.InnerText?.Trim();
                    var pubTime = ParsePubTime(dateText);

                    // PDF entries have no HTML body: store a download link and skip the detail fetch.
                    if (href.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase))
                    {
                        list.Add(new NewsModels
                        {
                            title = listTitle,
                            author = Author,
                            pubtime = pubTime,
                            summary = "该条为 PDF 附件,请点击查看。",
                            // The text promises a download, so it must actually link to the file.
                            // NOTE(review): confirm this markup against what the front end expects.
                            detail = $"<p><a href=\"{fullUrl}\" target=\"_blank\">点击下载附件</a></p>"
                        });
                        continue;
                    }

                    // Detail page: same approach, explicit UTF-8 decoding before parsing.
                    var detailBytes = await Http.GetByteArrayAsync(fullUrl);
                    var detailDoc = new HtmlDocument();
                    detailDoc.LoadHtml(Encoding.UTF8.GetString(detailBytes));

                    // Title from the detail page with the <small> child removed;
                    // fall back to the list title when the node is missing or empty.
                    var title = listTitle;
                    var titleNode = detailDoc.DocumentNode.SelectSingleNode("//div[@id='ivs_title']");
                    if (titleNode != null)
                    {
                        titleNode.SelectSingleNode("./small")?.Remove();
                        var detailTitle = titleNode.InnerText.Trim();
                        if (detailTitle.Length > 0)
                        {
                            title = detailTitle;
                        }
                    }

                    // Body.
                    var contentNode = detailDoc.DocumentNode.SelectSingleNode("//div[@id='ivs_content']");
                    var detail = new StringBuilder(contentNode?.InnerHtml?.Trim() ?? "");

                    // Append every attachment found in the body as an explicit link.
                    var links = contentNode?.SelectNodes(".//a[@href]");
                    if (links != null)
                    {
                        foreach (var link in links)
                        {
                            var hrefVal = link.GetAttributeValue("href", "");
                            if (!AttachmentLinkRegex.IsMatch(hrefVal))
                            {
                                continue;
                            }

                            var fullLink = ToAbsoluteUrl(BaseUrl, hrefVal);
                            // NOTE(review): confirm this markup against what the front end expects.
                            detail.Append($"<p>附件:<a href=\"{fullLink}\" target=\"_blank\">{link.InnerText}</a></p>");
                        }
                    }

                    var detailHtml = detail.ToString();
                    list.Add(new NewsModels
                    {
                        title = title,
                        author = Author,
                        pubtime = pubTime,
                        detail = detailHtml,
                        // The summary is taken after attachments were appended, so it may include their text.
                        summary = BuildSummary(detailHtml)
                    });

                    // Throttle without blocking a thread.
                    await Task.Delay(RequestDelayMilliseconds);
                }
                catch (Exception ex)
                {
                    // Best effort: one broken item must not abort the whole run.
                    Console.WriteLine($"❌ 抓取失败:{ex.Message}");
                }
            }

            return list;
        }

        /// <summary>
        /// Scrapes the Shandong list page and the detail page of every entry, synchronously.
        /// </summary>
        /// <returns>
        /// One entry per successfully processed list item; an empty list when the list
        /// markup is not found. A failing item is logged and skipped.
        /// </returns>
        private List<NewsModels> ShanDongNewsList()
        {
            // No trailing slash: ToAbsoluteUrl inserts exactly one separator.
            const string BaseUrl = "https://www.sdzk.cn";

            var web = new HtmlWeb();
            var listUrl = "https://www.sdzk.cn/NewsList.aspx?BCID=20&CID=1117";
            var doc = web.Load(listUrl);
            var newsList = new List<NewsModels>();

            var listItems = doc.DocumentNode.SelectNodes("//div[@class='blockLine']//ul[@class='bd']/li/a");
            if (listItems == null)
            {
                return newsList;
            }

            foreach (var item in listItems)
            {
                try
                {
                    // Basic fields.
                    var href = item.GetAttributeValue("href", "");
                    if (string.IsNullOrWhiteSpace(href))
                    {
                        // An empty href would resolve to the site root and be stored as an article.
                        continue;
                    }

                    var fullUrl = ToAbsoluteUrl(BaseUrl, href);

                    // NOTE(review): the date is matched inside this anchor's markup, so InnerText
                    // presumably contains the date as well — confirm whether it should be stripped.
                    var title = item.InnerText?.Trim();

                    // Publication time; falls back to the current time when no date is found,
                    // instead of throwing and losing the item.
                    var dateMatch = ListDateRegex.Match(item.InnerHtml);
                    var pubTime = ParsePubTime(dateMatch.Success ? dateMatch.Groups[1].Value : null);

                    // Detail page.
                    var detailDoc = web.Load(fullUrl);
                    var contentNode = detailDoc.DocumentNode.SelectSingleNode("//div[@id='UCAP-CONTENT']");
                    var detailHtml = contentNode?.InnerHtml?.Trim() ?? "";

                    // Author: first info span whose text starts with the author label.
                    var authorText = detailDoc.DocumentNode
                        .SelectNodes("//div[@class='MLspan']/span")
                        ?.Select(span => span.InnerText.Trim())
                        .FirstOrDefault(text => text.StartsWith("作者:", StringComparison.Ordinal));
                    var author = authorText != null
                        ? authorText.Replace("作者:", "").Trim()
                        : "未知作者";

                    newsList.Add(new NewsModels
                    {
                        title = title,
                        author = author,
                        summary = BuildSummary(detailHtml),
                        detail = detailHtml,
                        pubtime = pubTime
                    });

                    Thread.Sleep(RequestDelayMilliseconds); // rate limit (this method is synchronous)
                }
                catch (Exception ex)
                {
                    // Best effort: one broken item must not abort the whole run.
                    Console.WriteLine($"处理失败:{ex.Message}");
                }
            }

            return newsList;
        }

        /// <summary>
        /// Returns <paramref name="path"/> unchanged when it already starts with "http";
        /// otherwise joins it to <paramref name="baseUrl"/> with exactly one slash.
        /// </summary>
        private static string ToAbsoluteUrl(string baseUrl, string path)
        {
            if (path.StartsWith("http", StringComparison.OrdinalIgnoreCase))
            {
                return path;
            }

            return $"{baseUrl.TrimEnd('/')}/{path.TrimStart('/')}";
        }

        /// <summary>
        /// Parses a publication date, returning the current time when the text is
        /// null, empty, or not a recognizable date.
        /// </summary>
        private static DateTime ParsePubTime(string text)
        {
            return DateTime.TryParse(text, out var parsed) ? parsed : DateTime.Now;
        }

        /// <summary>
        /// Strips HTML tags from <paramref name="html"/> and keeps at most
        /// <see cref="SummaryMaxLength"/> characters.
        /// </summary>
        private static string BuildSummary(string html)
        {
            var plainText = HtmlTagRegex.Replace(html ?? "", "");
            return plainText.Length > SummaryMaxLength
                ? plainText.Substring(0, SummaryMaxLength)
                : plainText;
        }
    }
}
/// <summary>
/// Container pairing a list of <see cref="Msg"/> entries with a boolean flag.
/// NOTE(review): presumably the shape of a JSON response (the file imports
/// System.Text.Json) — confirm against the code that deserializes it.
/// </summary>
public class GaokaoObject
{
    /// <summary>The entries carried by this object.</summary>
    public List<Msg> msg { get; set; }

    /// <summary>Boolean flag; its meaning is not established in this file.</summary>
    public bool flag { get; set; }
}
/// <summary>
/// A single entry with a title, a second title variant, a URI and a display date,
/// all held as plain strings.
/// </summary>
public class Msg
{
    /// <summary>Title of the entry.</summary>
    public string title { get; set; }

    /// <summary>NOTE(review): presumably a shortened form of <see cref="title"/> — confirm.</summary>
    public string truncTitle { get; set; }

    /// <summary>URI of the entry; whether it is absolute or relative is not established here.</summary>
    public string uri { get; set; }

    /// <summary>Date as text; the format is not established in this file.</summary>
    public string displayDate { get; set; }
}