using System;
using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using HtmlAgilityPack;
using OpenQA.Selenium;
using OpenQA.Selenium.Chrome;

namespace New_Spider.阳光高考
{
    /// <summary>
    /// Scraping helpers. <see cref="HtmlCreatePageData"/> reads a news list and its
    /// article pages from www.sdzk.cn with HtmlAgilityPack;
    /// <see cref="HtmlSpiderYangGuangData"/> reads a script-rendered list from
    /// gaokao.chsi.com.cn through a headless Chrome session.
    /// </summary>
    public class HtmlAgNewsHelper
    {
        private const string SdzkBaseUrl = "https://www.sdzk.cn/";
        private const string SdzkNewsListUrl = "https://www.sdzk.cn/NewsList.aspx?BCID=2";
        private const string SdzkNewsItemXPath = "//*[@id=\"ctl00_ContentPlaceHolder1_ctl00_ContentPlaceHolder1_RadListView1Panel\"]/ul/li";
        private const string ArticleTitleXPath = "//*[@id=\"form1\"]/div[6]/div[2]/h3";
        private const string ArticleMetaXPath = "//*[@id=\"form1\"]/div[6]/div[2]/em";
        private const string ArticleBodyXPath = "//*[@id=\"form1\"]/div[6]/div[2]/div";

        private const string ChsiNewsUrl = "https://gaokao.chsi.com.cn/news/zszc.do";
        private const string ChsiContentXPath = "//*[@id=\"app\"]/div[2]/div[2]";
        private const string ChsiRowXPath = "//*[@id=\"app\"]/div[2]/div[2]/div[*]";
        private const string ChromeDriverExecutable = "chromedriver";
        private const string ChromeDriverDirectory = "D:\\一草一木教育\\newgaokao\\New_Spider\\bin\\Debug\\net6.0\\";

        /// <summary>
        /// Loads the sdzk.cn news list, prints one "url|text" line per list entry to the
        /// console, then loads every linked article and extracts its title, author,
        /// publish time and body HTML.
        /// </summary>
        /// <remarks>
        /// List entries or article pages that do not match the expected layout are
        /// skipped with a message on standard error instead of aborting the whole run.
        /// Network failures raised by <see cref="HtmlWeb.Load(string)"/> still propagate.
        /// </remarks>
        public void HtmlCreatePageData()
        {
            HtmlWeb webClient = new HtmlWeb();
            HtmlDocument listPage = webClient.Load(SdzkNewsListUrl);

            // SelectNodes returns null (not an empty collection) when nothing matches.
            HtmlNodeCollection categoryNodeList = listPage.DocumentNode.SelectNodes(SdzkNewsItemXPath);
            if (categoryNodeList is null)
            {
                Console.Error.WriteLine($"No news entries found at {SdzkNewsListUrl}.");
                return;
            }

            var articleUrls = new List<string>();
            foreach (HtmlNode item in categoryNodeList)
            {
                // The link is expected to be the first child of the <li>.
                string href = item.FirstChild?.Attributes["href"]?.Value;
                if (href is null)
                {
                    Console.Error.WriteLine("Skipping list entry without a link as its first child.");
                    continue;
                }

                string articleUrl = $"{SdzkBaseUrl}{href}";
                Console.WriteLine($"{articleUrl}|{item.InnerText}");
                articleUrls.Add(articleUrl);
            }

            foreach (string articleUrl in articleUrls)
            {
                HtmlDocument articlePage = webClient.Load(articleUrl);
                HtmlNode titleNode = articlePage.DocumentNode.SelectSingleNode(ArticleTitleXPath);
                HtmlNode metaNode = articlePage.DocumentNode.SelectSingleNode(ArticleMetaXPath);
                HtmlNode bodyNode = articlePage.DocumentNode.SelectSingleNode(ArticleBodyXPath);
                if (titleNode is null || metaNode is null || bodyNode is null)
                {
                    Console.Error.WriteLine($"Skipping {articleUrl}: page does not match the expected article layout.");
                    continue;
                }

                // The <em> line carries both the author and the publish time.
                string metaText = metaNode.InnerText;

                // TODO(review): the extracted fields are not stored or returned yet.
                var innerTitle = titleNode.InnerText;
                var author = SegmentAfter(metaText, "作者:").Split(" ")[0];
                var createTime = SegmentAfter(metaText, "发布时间:");
                var innerHtml = bodyNode.InnerHtml;
            }
        }

        /// <summary>
        /// Opens the chsi.com.cn news page in headless Chrome, prints the text of the
        /// first &lt;span&gt; of every list row, and finally prints the text of the list
        /// container.
        /// </summary>
        /// <remarks>
        /// The browser is always shut down, even when an element lookup throws.
        /// </remarks>
        public void HtmlSpiderYangGuangData()
        {
            // Configure Chrome to run headless.
            var options = new ChromeOptions();
            options.AddArgument("--headless");
            options.AddArgument("--no-sandbox");
            options.AddArgument("--disable-dev-shm-usage");

            ChromeDriverService service = ChromeDriverService.CreateDefaultService(ChromeDriverDirectory, ChromeDriverExecutable);
            var driver = new ChromeDriver(service, options);

            string contentText;
            try
            {
                driver.Navigate().GoToUrl(ChsiNewsUrl);

                // Fixed pause so the script-rendered list can appear; an explicit wait on the
                // list container would be more reliable.
                Thread.Sleep(1000);

                // Read the container text now: elements cannot be queried after Quit().
                IWebElement content = driver.FindElement(By.XPath(ChsiContentXPath));
                contentText = content.Text;

                IReadOnlyCollection<IWebElement> grandChildElements = driver.FindElements(By.XPath(ChsiRowXPath));
                foreach (IWebElement grandChildElement in grandChildElements)
                {
                    // FindElements returns an empty collection where FindElement would throw.
                    IWebElement span = grandChildElement.FindElements(By.TagName("span")).FirstOrDefault();
                    IWebElement link = grandChildElement.FindElements(By.TagName("a")).FirstOrDefault();
                    if (span is null || link is null)
                    {
                        Console.Error.WriteLine("Skipping row without the expected <span>/<a> children.");
                        continue;
                    }

                    var grandChildText = span.Text;

                    // TODO(review): title and URL are extracted but not used yet.
                    var titleText = link.Text;
                    var linkUrl = link.GetAttribute("href");

                    Console.WriteLine($"Grandchild Element Text: {grandChildText}");
                }
            }
            finally
            {
                // Shut the browser down on every path so no chromedriver process is left behind.
                driver.Quit();
            }

            // Print the text captured from the list container.
            Console.WriteLine(contentText);
        }

        /// <summary>
        /// Returns the part of <paramref name="text"/> that follows the first occurrence of
        /// <paramref name="marker"/> (up to the next occurrence, if any), or an empty string
        /// when the marker is absent.
        /// </summary>
        private static string SegmentAfter(string text, string marker)
        {
            string[] parts = text.Split(marker);
            return parts.Length > 1 ? parts[1] : string.Empty;
        }
    }
}