bug fixed
parent
86eb0ff747
commit
f7d6c89b7a
|
|
@ -36,10 +36,10 @@ namespace New_College.Tasks
|
|||
|
||||
private async void DoWork(object state)
|
||||
{
|
||||
// BatchListAgHelper batchListAgHelper = new BatchListAgHelper();
|
||||
// BatchListAgHelper batchListAgHelper = new BatchListAgHelper();
|
||||
NationWideNewsAgHelper anghelper = new NationWideNewsAgHelper();
|
||||
var provineInfo = await this.sysRegionServices.Query(e => e.Level == 1 && !e.RegionCode.Contains("-"));
|
||||
provineInfo.ForEach(p =>
|
||||
provineInfo.ForEach(async p =>
|
||||
{
|
||||
//var years = new List<string>() { "2024", "2023", "2022" };
|
||||
//years.ForEach(y =>
|
||||
|
|
@ -48,7 +48,7 @@ namespace New_College.Tasks
|
|||
//});
|
||||
try
|
||||
{
|
||||
var list = anghelper.HtmlCreatePageData(p.RegionCode);
|
||||
var list = await anghelper.HtmlCreatePageData(p.RegionCode);
|
||||
list.ForEach(async c =>
|
||||
{
|
||||
c.title = c.title.Replace($"{p.SimpleName}:", "");
|
||||
|
|
@ -68,7 +68,7 @@ namespace New_College.Tasks
|
|||
OrderSort = 0,
|
||||
IsDelete = false,
|
||||
Title = c.title,
|
||||
Summary = HtmlHelper.ReplaceHtmlTag(c.detail).Length > 200 ? HtmlHelper.ReplaceHtmlTag(c.detail).Substring(0, 200) : HtmlHelper.ReplaceHtmlTag(c.detail)
|
||||
Summary = c.summary
|
||||
});
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -54,6 +54,7 @@ namespace New_College.Tasks
|
|||
{
|
||||
public string title { get; set; }
|
||||
public string author { get; set; }
|
||||
public string summary { get; set; }
|
||||
public DateTime pubtime { get; set; }
|
||||
public string detail { get; set; }
|
||||
}
|
||||
|
|
|
|||
|
|
@ -18,81 +18,212 @@ namespace New_College.Tasks
|
|||
{
|
||||
|
||||
private string itemUrl = "https://gaokao.chsi.com.cn";
|
||||
/// <summary>
/// Collects news items for one province by dispatching to that province's scraper.
/// </summary>
/// <param name="provinceCode">
/// Region code of the province. "310000" is routed to <c>ShangHaiNewsList</c> and
/// "370000" to <c>ShanDongNewsList</c>; every other value has no scraper.
/// </param>
/// <returns>
/// The scraped items, or an empty list (never null) when no scraper exists for the code.
/// </returns>
public async Task<List<NewsModels>> HtmlCreatePageData(string provinceCode)
{
    switch (provinceCode)
    {
        case "310000":
            return await ShangHaiNewsList();

        case "370000":
            return ShanDongNewsList();

        default:
            // Unsupported province: callers iterate the result, so return an empty list.
            return new List<NewsModels>();
    }
}
|
||||
|
||||
|
||||
/// <summary>
/// Scrapes the news list at https://www.shmeea.edu.cn/page/02200/index.html and
/// downloads every linked article.
/// </summary>
/// <remarks>
/// Entries that link directly to a PDF are stored with a download link instead of article text.
/// A failure on a single entry is written to the console and the remaining entries are still processed.
/// A failure to download the list page itself is not caught here and propagates to the caller.
/// </remarks>
/// <returns>The entries that could be scraped; an empty list when the list markup is not found.</returns>
private async Task<List<NewsModels>> ShangHaiNewsList()
{
    string BaseUrl = "https://www.shmeea.edu.cn";
    var list = new List<NewsModels>();
    var listUrl = $"{BaseUrl}/page/02200/index.html";
    using var client = new HttpClient();

    // Fetch raw bytes and decode them explicitly as UTF-8.
    var bytes = await client.GetByteArrayAsync(listUrl);
    var html = Encoding.UTF8.GetString(bytes);

    // Parse the already-decoded string rather than letting the parser fetch the page.
    var listDoc = new HtmlDocument();
    listDoc.LoadHtml(html);

    // Fuzzy class match so extra class names on the <ul> do not break the selector.
    var items = listDoc.DocumentNode.SelectNodes("//ul[contains(@class,'pageList')]/li");
    if (items == null)
    {
        Console.WriteLine("❌ 页面列表未找到,可能被反爬或类名变动!");
        File.WriteAllText("debug.html", listDoc.DocumentNode.OuterHtml); // dump the page for debugging
        return list;
    }

    foreach (var li in items)
    {
        try
        {
            var aTag = li.SelectSingleNode("./a");
            if (aTag == null)
            {
                // An entry without a link has nothing to fetch; skip it instead of
                // relying on a NullReferenceException being caught below.
                continue;
            }

            var span = li.SelectSingleNode("./span[@class='listTime']");

            var href = aTag.GetAttributeValue("href", "");
            var fullUrl = href.StartsWith("http") ? href : $"{BaseUrl}{href}";
            var title = aTag.GetAttributeValue("title", "")?.Trim();
            var date = span?.InnerText?.Trim();

            // Parse once for both branches; an unreadable date falls back to "now"
            // so the entry is kept rather than dropped.
            var published = DateTime.TryParse(date, out var parsedDate) ? parsedDate : DateTime.Now;

            // PDF entries have no HTML body to scrape: store a download link only.
            if (href.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase))
            {
                list.Add(new NewsModels
                {
                    title = title,
                    author = "上海市教育考试院",
                    pubtime = published,
                    summary = "该条为 PDF 附件,请点击查看。",
                    detail = $"<p><a href=\"{fullUrl}\" target=\"_blank\">点击下载附件</a></p>"
                });
                continue;
            }

            // Load the detail page the same way as the list page: HttpClient + UTF-8 decode.
            var detailBytes = await client.GetByteArrayAsync(fullUrl);
            var detailHtml = Encoding.UTF8.GetString(detailBytes);
            var detailDoc = new HtmlDocument();
            detailDoc.LoadHtml(detailHtml);

            var model = new NewsModels
            {
                // Start from the list-page title so the entry is not left untitled
                // when the detail page lacks the title node.
                title = title,
                author = "上海市教育考试院",
                pubtime = published
            };

            // Title from the detail page, with any <small> child removed first.
            var titleNode = detailDoc.DocumentNode.SelectSingleNode("//div[@id='ivs_title']");
            if (titleNode != null)
            {
                var small = titleNode.SelectSingleNode("./small");
                if (small != null)
                {
                    small.Remove();
                }
                model.title = titleNode.InnerText.Trim();
            }

            // Article body.
            var contentNode = detailDoc.DocumentNode.SelectSingleNode("//div[@id='ivs_content']");
            model.detail = contentNode?.InnerHtml?.Trim() ?? "";

            // Append attachment links found in the body to the end of the body.
            var links = contentNode?.SelectNodes(".//a[@href]");
            if (links != null)
            {
                foreach (var link in links)
                {
                    var hrefVal = link.GetAttributeValue("href", "");
                    if (Regex.IsMatch(hrefVal, @"\.(pdf|mp4|docx?|xls)$", RegexOptions.IgnoreCase))
                    {
                        var fullLink = hrefVal.StartsWith("http") ? hrefVal : $"{BaseUrl}{hrefVal}";
                        model.detail += $"<p>附件:<a href=\"{fullLink}\" target=\"_blank\">{link.InnerText}</a></p>";
                    }
                }
            }

            // Summary: the first 120 characters of the body with tags stripped.
            var plainText = Regex.Replace(model.detail, "<.*?>", "");
            model.summary = plainText.Length > 120 ? plainText.Substring(0, 120) : plainText;
            list.Add(model);

            // Throttle requests without blocking a thread inside this async method.
            await Task.Delay(200);
        }
        catch (Exception ex)
        {
            Console.WriteLine($"❌ 抓取失败:{ex.Message}");
        }
    }

    return list;
}
|
||||
|
||||
/// <summary>
/// Scrapes the news list at https://www.sdzk.cn/NewsList.aspx?BCID=20&amp;CID=1117 and
/// loads every linked article.
/// </summary>
/// <remarks>
/// A failure on a single entry is written to the console and the remaining entries are still processed.
/// A failure to load the list page itself is not caught here and propagates to the caller.
/// </remarks>
/// <returns>The entries that could be scraped; an empty list when the list markup is not found.</returns>
private List<NewsModels> ShanDongNewsList()
{
    // No trailing slash: the join below supplies its own separator, so a trailing
    // slash here would produce "//" in every relative URL.
    string BaseUrl = "https://www.sdzk.cn";

    var web = new HtmlWeb();
    var listUrl = "https://www.sdzk.cn/NewsList.aspx?BCID=20&CID=1117";
    var doc = web.Load(listUrl);
    var newsList = new List<NewsModels>();

    var listItems = doc.DocumentNode.SelectNodes("//div[@class='blockLine']//ul[@class='bd']/li/a");
    if (listItems == null)
    {
        return newsList;
    }

    foreach (var item in listItems)
    {
        try
        {
            // Basic fields from the list entry.
            var href = item.GetAttributeValue("href", "");
            var fullUrl = href.StartsWith("http") ? href : $"{BaseUrl}/{href.TrimStart('/')}";
            var title = item.InnerText?.Trim();

            // Publication date: taken from an <i>yyyy-MM-dd</i> element inside the anchor.
            // A missing or unreadable date falls back to "now" so the entry is kept
            // instead of being discarded by a parse exception.
            var dateMatch = Regex.Match(item.InnerHtml, @"<i>(\d{4}-\d{2}-\d{2})<\/i>");
            var pubtime = DateTime.Now;
            if (dateMatch.Success)
            {
                var dateText = dateMatch.Groups[1].Value;
                if (DateTime.TryParse(dateText, out var parsedDate))
                {
                    pubtime = parsedDate;
                }

                // The date element sits inside the anchor, so InnerText carries the
                // date as well; strip it so only the headline remains.
                title = title?.Replace(dateText, "").Trim();
            }

            // Load the detail page.
            var detailDoc = web.Load(fullUrl);
            var contentNode = detailDoc.DocumentNode.SelectSingleNode("//div[@id='UCAP-CONTENT']");
            var detailHtml = contentNode?.InnerHtml?.Trim() ?? "";

            // Author: the first info span that starts with the author label.
            var infoSpans = detailDoc.DocumentNode.SelectNodes("//div[@class='MLspan']/span");
            string author = "未知作者";
            if (infoSpans != null)
            {
                foreach (var span in infoSpans)
                {
                    var text = span.InnerText.Trim();
                    if (text.StartsWith("作者:"))
                    {
                        author = text.Replace("作者:", "").Trim();
                        break;
                    }
                }
            }

            // Summary: the first 120 characters of the body with tags stripped.
            var plainText = Regex.Replace(detailHtml, "<.*?>", "");
            var summary = plainText.Length > 120 ? plainText.Substring(0, 120) : plainText;

            newsList.Add(new NewsModels
            {
                title = title,
                author = author,
                summary = summary,
                detail = detailHtml,
                pubtime = pubtime
            });

            Thread.Sleep(200); // rate limit between detail-page requests
        }
        catch (Exception ex)
        {
            Console.WriteLine($"处理失败:{ex.Message}");
        }
    }

    return newsList;
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue