// C# source listing (243 lines, 9.4 KiB)
using Aliyun.OSS;
|
||
using HtmlAgilityPack;
|
||
using Microsoft.AspNetCore.Mvc.RazorPages;
|
||
using System;
|
||
using System.Collections.Generic;
|
||
using System.IO;
|
||
using System.Linq;
|
||
using System.Net.Http;
|
||
using System.Text;
|
||
using System.Threading.Tasks;
|
||
using System.Text.Json;
|
||
using New_College.Common.Helper;
|
||
using System.Threading;
|
||
using System.Text.RegularExpressions;
|
||
namespace New_College.Tasks
|
||
{
|
||
public class NationWideNewsAgHelper
|
||
{
|
||
|
||
// Base URL for gaokao.chsi.com.cn.
// NOTE(review): no method visible in this class reads this field — confirm whether it is still needed.
private string itemUrl = "https://gaokao.chsi.com.cn";
|
||
/// <summary>
/// Returns the scraped news entries for the given province code.
/// </summary>
/// <param name="provinceCode">
/// Province code selecting the scraper: <c>"310000"</c> runs <c>ShangHaiNewsList</c>,
/// <c>"370000"</c> runs <c>ShanDongNewsList</c>.
/// </param>
/// <returns>
/// The list produced by the selected scraper, or an empty list when the code matches neither value.
/// </returns>
public async Task<List<NewsModels>> HtmlCreatePageData(string provinceCode)
{
    if (provinceCode == "310000")
    {
        return await ShangHaiNewsList();
    }

    if (provinceCode == "370000")
    {
        // This scraper is synchronous, so there is nothing to await here.
        return ShanDongNewsList();
    }

    // Unrecognised (or null) province code: nothing to scrape.
    return new List<NewsModels>();
}
|
||
|
||
|
||
/// <summary>
/// HTTP client reused by <see cref="ShangHaiNewsList"/> for the list request and every detail
/// request, instead of creating and disposing a new client on each call.
/// </summary>
private static readonly HttpClient ShangHaiHttpClient = new HttpClient();

/// <summary>
/// Scrapes the news list at <c>https://www.shmeea.edu.cn/page/02200/index.html</c>.
/// For each list entry it either records a download stub (when the link ends in <c>.pdf</c>)
/// or downloads the detail page and extracts the title, the body HTML, links to attachments
/// and a plain-text summary of at most 120 characters.
/// </summary>
/// <returns>
/// The entries that could be processed. An entry whose processing throws is logged to the
/// console and skipped. The list is empty when the page contains no matching list items.
/// </returns>
/// <exception cref="HttpRequestException">
/// The list page itself could not be downloaded (only per-entry failures are caught).
/// </exception>
private async Task<List<NewsModels>> ShangHaiNewsList()
{
    const string BaseUrl = "https://www.shmeea.edu.cn";
    const string Author = "上海市教育考试院";
    var listUrl = $"{BaseUrl}/page/02200/index.html";
    var list = new List<NewsModels>();

    // Resolves an href against the page it was found on, so root-relative, document-relative
    // and absolute links all produce a usable URL. Falls back to the raw href if it cannot
    // be combined with the page URL.
    string Resolve(string pageUrl, string href) =>
        Uri.TryCreate(new Uri(pageUrl), href, out var absolute) ? absolute.AbsoluteUri : href;

    // A missing or unparseable date falls back to the current time in both the PDF and the
    // detail-page branch, so a bad date never causes an entry to be dropped.
    DateTime ParseDate(string text) =>
        DateTime.TryParse(text, out var parsed) ? parsed : DateTime.Now;

    // The bytes are decoded explicitly as UTF-8 rather than relying on charset detection.
    var listBytes = await ShangHaiHttpClient.GetByteArrayAsync(listUrl);
    var listDoc = new HtmlDocument();
    listDoc.LoadHtml(Encoding.UTF8.GetString(listBytes));

    // Substring match on the class attribute so additional class names on the <ul> do not break the query.
    var items = listDoc.DocumentNode.SelectNodes("//ul[contains(@class,'pageList')]/li");
    if (items == null)
    {
        Console.WriteLine("❌ 页面列表未找到，可能被反爬或类名变动！");
        try
        {
            // Debugging aid: keep a copy of the page that did not match.
            File.WriteAllText("debug.html", listDoc.DocumentNode.OuterHtml);
        }
        catch (Exception dumpEx)
        {
            // Failing to write the debug dump must not turn an empty result into a crash.
            Console.WriteLine($"debug.html could not be written: {dumpEx.Message}");
        }

        return list;
    }

    foreach (var li in items)
    {
        try
        {
            var aTag = li.SelectSingleNode("./a");
            var href = aTag?.GetAttributeValue("href", "") ?? "";
            if (string.IsNullOrWhiteSpace(href))
            {
                // No usable link: report it instead of relying on a NullReferenceException.
                Console.WriteLine("❌ 抓取失败：列表项缺少链接");
                continue;
            }

            var fullUrl = Resolve(listUrl, href);
            var date = li.SelectSingleNode("./span[@class='listTime']")?.InnerText?.Trim();

            var listTitle = aTag.GetAttributeValue("title", "")?.Trim();
            if (string.IsNullOrEmpty(listTitle))
            {
                // No title attribute: use the visible link text instead.
                listTitle = aTag.InnerText.Trim();
            }

            // PDF entries have no HTML body to fetch; store a download link instead.
            if (href.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase))
            {
                list.Add(new NewsModels
                {
                    title = listTitle,
                    author = Author,
                    pubtime = ParseDate(date),
                    summary = "该条为 PDF 附件，请点击查看。",
                    detail = $"<p><a href=\"{fullUrl}\" target=\"_blank\">点击下载附件</a></p>"
                });
                continue;
            }

            // Detail page: same explicit UTF-8 decoding as the list page.
            var detailBytes = await ShangHaiHttpClient.GetByteArrayAsync(fullUrl);
            var detailDoc = new HtmlDocument();
            detailDoc.LoadHtml(Encoding.UTF8.GetString(detailBytes));

            var model = new NewsModels
            {
                // Start from the list title so the entry still has one when the detail page has no title node.
                title = listTitle,
                author = Author,
                pubtime = ParseDate(date)
            };

            // Title from the detail page, with any direct <small> child removed first.
            var titleNode = detailDoc.DocumentNode.SelectSingleNode("//div[@id='ivs_title']");
            if (titleNode != null)
            {
                titleNode.SelectSingleNode("./small")?.Remove();
                var detailTitle = titleNode.InnerText.Trim();
                if (detailTitle.Length > 0)
                {
                    model.title = detailTitle;
                }
            }

            // Body HTML.
            var contentNode = detailDoc.DocumentNode.SelectSingleNode("//div[@id='ivs_content']");
            var detail = new StringBuilder(contentNode?.InnerHtml?.Trim() ?? "");

            // Links in the body that point at pdf/mp4/doc/docx/xls files are repeated after the body as attachment lines.
            var links = contentNode?.SelectNodes(".//a[@href]");
            if (links != null)
            {
                foreach (var link in links)
                {
                    var hrefVal = link.GetAttributeValue("href", "");
                    if (Regex.IsMatch(hrefVal, @"\.(pdf|mp4|docx?|xls)$", RegexOptions.IgnoreCase))
                    {
                        var fullLink = Resolve(fullUrl, hrefVal);
                        detail.Append($"<p>附件：<a href=\"{fullLink}\" target=\"_blank\">{link.InnerText}</a></p>");
                    }
                }
            }

            model.detail = detail.ToString();

            // Summary: the detail HTML with tags stripped, cut to 120 characters.
            var plainText = Regex.Replace(model.detail, "<.*?>", "");
            model.summary = plainText.Length > 120 ? plainText.Substring(0, 120) : plainText;
            list.Add(model);

            // Pause between detail requests without blocking a thread.
            await Task.Delay(200);
        }
        catch (Exception ex)
        {
            // Best effort: one failing entry must not abort the remaining ones.
            Console.WriteLine($"❌ 抓取失败：{ex.Message}");
        }
    }

    return list;
}
|
||
|
||
/// <summary>
/// Scrapes the news list at <c>https://www.sdzk.cn/NewsList.aspx?BCID=20&amp;CID=1117</c> and
/// loads each linked detail page to extract the body HTML, the author and a plain-text
/// summary of at most 120 characters.
/// </summary>
/// <returns>
/// The entries that could be processed. An entry whose processing throws is logged to the
/// console and skipped. The list is empty when the page contains no matching links.
/// </returns>
/// <remarks>
/// This method is synchronous: it loads pages with <c>HtmlWeb.Load</c> and pauses with
/// <c>Thread.Sleep</c> between entries. A failure to load the list page itself is not caught.
/// </remarks>
private List<NewsModels> ShanDongNewsList()
{
    const string ListUrl = "https://www.sdzk.cn/NewsList.aspx?BCID=20&CID=1117";
    const string AuthorPrefix = "作者:";

    var web = new HtmlWeb();
    var newsList = new List<NewsModels>();

    var doc = web.Load(ListUrl);
    var listItems = doc.DocumentNode.SelectNodes("//div[@class='blockLine']//ul[@class='bd']/li/a");
    if (listItems == null)
    {
        return newsList;
    }

    var listUri = new Uri(ListUrl);

    foreach (var item in listItems)
    {
        try
        {
            var href = item.GetAttributeValue("href", "");
            if (string.IsNullOrWhiteSpace(href))
            {
                // Without a link there is no detail page; do not fetch the site root instead.
                continue;
            }

            // Resolve against the list page so relative and root-relative hrefs do not end up
            // with a doubled slash after the host.
            var fullUrl = Uri.TryCreate(listUri, href, out var resolved) ? resolved.AbsoluteUri : href;

            // The publication date is an <i>yyyy-MM-dd</i> element inside the anchor.
            var dateMatch = Regex.Match(item.InnerHtml, @"<i>(\d{4}-\d{2}-\d{2})<\/i>");
            var dateText = dateMatch.Success ? dateMatch.Groups[1].Value : null;

            // A missing or unparseable date falls back to the current time instead of throwing
            // and dropping the entry.
            var pubtime = DateTime.TryParse(dateText, out var parsedDate) ? parsedDate : DateTime.Now;

            // The anchor's text includes the date element's text; strip it so the title holds only the headline.
            var title = item.InnerText?.Trim() ?? "";
            if (dateText != null)
            {
                if (title.StartsWith(dateText, StringComparison.Ordinal))
                {
                    title = title.Substring(dateText.Length).TrimStart();
                }
                else if (title.EndsWith(dateText, StringComparison.Ordinal))
                {
                    title = title.Substring(0, title.Length - dateText.Length).TrimEnd();
                }
            }

            // Detail page.
            var detailDoc = web.Load(fullUrl);
            var contentNode = detailDoc.DocumentNode.SelectSingleNode("//div[@id='UCAP-CONTENT']");
            var detailHtml = contentNode?.InnerHtml?.Trim() ?? "";

            // Author: the first info span whose text starts with the author prefix.
            string author = "未知作者";
            var infoSpans = detailDoc.DocumentNode.SelectNodes("//div[@class='MLspan']/span");
            if (infoSpans != null)
            {
                foreach (var span in infoSpans)
                {
                    var text = span.InnerText.Trim();
                    if (text.StartsWith(AuthorPrefix, StringComparison.Ordinal))
                    {
                        author = text.Substring(AuthorPrefix.Length).Trim();
                        break;
                    }
                }
            }

            // NOTE(review): the URL of the first image in the content used to be computed here
            // but was never stored on the model; if NewsModels has a cover-image field, set it here.

            // Summary: the body HTML with tags stripped, cut to 120 characters.
            var plainText = Regex.Replace(detailHtml, "<.*?>", "");
            var summary = plainText.Length > 120 ? plainText.Substring(0, 120) : plainText;

            newsList.Add(new NewsModels
            {
                title = title,
                author = author,
                summary = summary,
                detail = detailHtml,
                pubtime = pubtime
            });

            // Throttle between detail requests.
            Thread.Sleep(200);
        }
        catch (Exception ex)
        {
            // Best effort: one failing entry must not abort the remaining ones.
            Console.WriteLine($"处理失败：{ex.Message}");
        }
    }

    return newsList;
}
|
||
|
||
}
|
||
}
|
||
|
||
/// <summary>
/// Container pairing a list of <see cref="Msg"/> entries with a boolean flag.
/// NOTE(review): presumably mirrors a JSON payload (the lower-case member names look like
/// JSON keys) — confirm against the code that deserializes it; nothing in this file uses it.
/// </summary>
public class GaokaoObject
{
    /// <summary>The entries carried by this object.</summary>
    public List<Msg> msg { get; set; }

    /// <summary>Boolean flag; its meaning is not established in this file.</summary>
    public bool flag { get; set; }
}
|
||
|
||
/// <summary>
/// One entry of <c>GaokaoObject.msg</c>: four string fields describing a single item.
/// NOTE(review): the lower-case member names look like JSON keys — confirm against the
/// code that deserializes this type.
/// </summary>
public class Msg
{
    /// <summary>Title text.</summary>
    public string title { get; set; }

    /// <summary>Presumably a shortened form of the title — TODO confirm.</summary>
    public string truncTitle { get; set; }

    /// <summary>URI of the item; whether it is absolute or relative is not shown in this file.</summary>
    public string uri { get; set; }

    /// <summary>Date as a display string; the format is not established in this file.</summary>
    public string displayDate { get; set; }
}
|