feat:爬虫数据部分提交
parent
51c8c9a526
commit
9edc8a7aa2
|
|
@ -1,4 +1,5 @@
|
|||
using Microsoft.AspNetCore.Mvc;
|
||||
using New_College.Common.Helper;
|
||||
using New_College.IServices;
|
||||
using System.Threading.Tasks;
|
||||
|
||||
|
|
@ -24,6 +25,7 @@ namespace New_College.Controllers
|
|||
// Handles GET requests for this controller; responds with the result of Ok()
// and carries no payload.
[HttpGet]
public IActionResult Get() => Ok();
|
||||
|
||||
|
|
@ -35,7 +37,9 @@ namespace New_College.Controllers
|
|||
public Task<bool> Post()
{
    // Triggers the university-description back-fill and hands its task back to the caller.
    //
    // Previous one-off imports, kept for reference. Only one import may be live at a time:
    // a second `return` after the first one is unreachable.
    // return t_EnrollmentPlane.Import();
    // return t_EnrollmentPlane.Importuniverbaseinfo();

    return t_EnrollmentPlane.universitydetailupdate();
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -527,7 +527,7 @@
|
|||
</member>
|
||||
<member name="P:New_College.Model.Models.D_University.Nature">
|
||||
<summary>
|
||||
办学性质 1、公办,2、民办
|
||||
办学性质 1、公办,2、民办,3中外合作,4 港澳台
|
||||
</summary>
|
||||
</member>
|
||||
<member name="P:New_College.Model.Models.D_University.Ascription">
|
||||
|
|
@ -4799,6 +4799,11 @@
|
|||
是否双一流
|
||||
</summary>
|
||||
</member>
|
||||
<member name="P:New_College.Model.ViewModels.UniversityResult.QJJH">
|
||||
<summary>
|
||||
强基
|
||||
</summary>
|
||||
</member>
|
||||
<member name="P:New_College.Model.ViewModels.UniversityResult.Logo">
|
||||
<summary>
|
||||
学校logo
|
||||
|
|
|
|||
|
|
@ -0,0 +1,88 @@
|
|||
using Newtonsoft.Json;
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.IO;
|
||||
using System.Text;
|
||||
|
||||
namespace New_College.Common.Helper
|
||||
{
|
||||
/// <summary>
/// Helpers for translating university type codes and for reading JSON files from disk.
/// </summary>
public static class UniversityTypeRelsove
{
    // Display names indexed by type code:
    // 0 comprehensive, 1 science and engineering, 2 medical, 3 military, 4 language,
    // 5 teacher training, 6 finance and economics, 7 politics and law, 8 ethnic,
    // 9 agriculture and forestry, 10 arts, 11 sports, 12 other.
    private static readonly string[] TypeNames =
    {
        "综合类",
        "理工类",
        "医学类",
        "军事类",
        "语言类",
        "师范类",
        "财经类",
        "政法类",
        "民族类",
        "农林类",
        "艺术类",
        "体育类",
        "其他",
    };

    /// <summary>
    /// Converts a university type code into its display name.
    /// </summary>
    /// <param name="type">Type code; codes 0 through 12 are known.</param>
    /// <returns>The display name for the code, or an empty string for any other value.</returns>
    public static string GetType(int type)
    {
        bool isKnownCode = type >= 0 && type < TypeNames.Length;
        return isKnownCode ? TypeNames[type] : string.Empty;
    }

    /// <summary>
    /// Reads the whole file at <paramref name="filepath"/> as UTF-8 text.
    /// </summary>
    /// <param name="filepath">Path of the file to read.</param>
    /// <returns>The complete file content.</returns>
    public static string GetFileJson(string filepath)
    {
        // Opened read-only while allowing other handles to read and write the file.
        using (var stream = new FileStream(filepath, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
        using (var reader = new StreamReader(stream, Encoding.GetEncoding("utf-8")))
        {
            return reader.ReadToEnd();
        }
    }
}
|
||||
}
|
||||
|
|
@ -14,7 +14,7 @@ namespace New_College.IServices
|
|||
public interface IT_EnrollmentPlaneServices : IBaseServices<T_EnrollmentPlane>
|
||||
{
|
||||
Task<bool> Import();
|
||||
|
||||
Task<bool> universitydetailupdate();
|
||||
Task<bool> Importuniverbaseinfo();
|
||||
/// <summary>
|
||||
/// 根据大学或者专业获取招生计划
|
||||
|
|
|
|||
|
|
@ -35,7 +35,7 @@ namespace New_College.Model.Models
|
|||
public string Build_Date { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// 办学性质 1、公办,2、民办
|
||||
/// 办学性质 1、公办,2、民办,3中外合作,4 港澳台
|
||||
/// </summary>
|
||||
[SugarColumn(IsNullable = true)]
|
||||
public int Nature { get; set; }
|
||||
|
|
|
|||
|
|
@ -30,6 +30,11 @@ namespace New_College.Model.ViewModels
|
|||
/// 是否双一流
|
||||
/// </summary>
|
||||
public bool? Syl { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// 强基
|
||||
/// </summary>
|
||||
public bool? QJJH { get; set; }
|
||||
/// <summary>
|
||||
/// 学校logo
|
||||
/// </summary>
|
||||
|
|
|
|||
|
|
@ -304,6 +304,7 @@ namespace New_College.Services
|
|||
Syl = university.Syl == 1 ? true : false,
|
||||
Nhef = university.Nhef == 1 ? true : false,
|
||||
Sff = university.Sff == 1 ? true : false,
|
||||
QJJH = university.QJJH == 1 ? true : false,
|
||||
UniversityType = university.Type,
|
||||
SubjectLevel = university.Subject_Level,
|
||||
Imglist = imgs,
|
||||
|
|
|
|||
|
|
@ -9,6 +9,7 @@ using System.Threading.Tasks;
|
|||
using System.Collections.Generic;
|
||||
using New_College.IRepository;
|
||||
using System.Linq;
|
||||
using New_College.Common.Helper;
|
||||
|
||||
namespace New_College.Services
|
||||
{
|
||||
|
|
@ -161,14 +162,14 @@ namespace New_College.Services
|
|||
{
|
||||
Id = c.Id,
|
||||
Name = c.Name,
|
||||
Nature = c.Nature == 1 ? "公立" : "私立",
|
||||
Nature = c.Nature == 0 ? "公办" : c.Nature == 1 ? "民办" : c.Nature == 2 ? "中外合作" : "港澳台合作",
|
||||
AscriptionName = string.IsNullOrWhiteSpace(c.AscriptionName) ? "-" : c.AscriptionName,
|
||||
Rank = c.Rank,
|
||||
AreaName = c.Area_Name,
|
||||
Syl = c.Syl == 1 ? "是" : "否",
|
||||
Nhef = c.Nhef == 1 ? "是" : "否",
|
||||
Sff = c.Sff == 1 ? "是" : "否",
|
||||
UniversityType = c.Type == 0 ? "综合" : c.Type == 1 ? "理工" : "医学类",
|
||||
UniversityType = UniversityTypeRelsove.GetType(c.Type),
|
||||
BuildDate = c.Build_Date,
|
||||
SubjectLevel = c.Subject_Level == 1 ? "本科" : c.Subject_Level == 2 ? "专科" : "-",
|
||||
AcademicianCount = c.Academician_Count <= 0 ? "-" : c.Academician_Count.ToString(),
|
||||
|
|
|
|||
|
|
@ -15,6 +15,10 @@ using New_College.Model.ViewModels.Result;
|
|||
using New_College.Model.ViewModels.Query;
|
||||
using New_College.Model;
|
||||
using System.Text.RegularExpressions;
|
||||
using New_College.Common.Helper;
|
||||
using Newtonsoft.Json;
|
||||
using System.IO;
|
||||
using System.Threading;
|
||||
|
||||
namespace New_College.Services
|
||||
{
|
||||
|
|
@ -51,6 +55,54 @@ namespace New_College.Services
|
|||
|
||||
|
||||
|
||||
/// <summary>
/// Back-fills missing university descriptions from the JSON files written by the spider.
/// </summary>
/// <remarks>
/// Reads <c>0.json</c>, <c>20.json</c>, ... up to <c>2800.json</c> from the export folder.
/// For each entry with a non-blank description, the first non-deleted university whose
/// <c>Name</c> equals the trimmed entry name is updated, but only if its own
/// <c>Description</c> is still blank. Updates are awaited one by one, so the method
/// completes only after every write has finished and any failure reaches the caller.
/// </remarks>
/// <returns><c>true</c> once all files have been processed.</returns>
public async Task<bool> universitydetailupdate()
{
    var universitylist = await this.d_University.Query(x => x.IsDelete == false);

    for (int start = 0; start <= 2800; start += 20)
    {
        var jsontext = UniversityTypeRelsove.GetFileJson(string.Format(@"D:\\jsondoc\\2023-09-14\\{0}.json", start));
        if (string.IsNullOrWhiteSpace(jsontext))
        {
            continue;
        }

        // The spider stores the array double-encoded (a JSON string literal that itself
        // contains JSON). Decode the outer literal with the JSON parser rather than by
        // stripping quotes and regex-unescaping; a plain array is accepted as well.
        var trimmed = jsontext.TrimStart();
        var payload = trimmed[0] == '"' ? JsonConvert.DeserializeObject<string>(trimmed) : trimmed;
        if (string.IsNullOrWhiteSpace(payload))
        {
            continue;
        }

        var result = JsonConvert.DeserializeObject<List<Class1>>(payload);
        if (result == null)
        {
            continue;
        }

        // A plain foreach with awaits instead of List.ForEach(async ...): the latter creates
        // async void lambdas whose updates are never awaited and whose exceptions are lost.
        foreach (var entry in result)
        {
            if (entry == null || string.IsNullOrWhiteSpace(entry.description) || string.IsNullOrWhiteSpace(entry.name))
            {
                continue;
            }

            var name = entry.name.Trim();
            var university = universitylist.FirstOrDefault(u => u.Name == name);
            if (university == null || !string.IsNullOrWhiteSpace(university.Description))
            {
                continue;
            }

            university.Description = entry.description;
            university.ModifyTime = DateTime.Now;
            await this.d_University.Update(university);

            // Short non-blocking pause between writes (replaces the former Thread.Sleep).
            await Task.Delay(100);
        }
    }

    return true;
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/// <summary>
/// One entry of the spider's JSON export. The lower-case property names are the
/// JSON keys used when the export is deserialized.
/// </summary>
public class Class1
{
    /// <summary>Value of the "id" key.</summary>
    public string id { get; set; }

    /// <summary>Value of the "name" key; matched against the university name.</summary>
    public string name { get; set; }

    /// <summary>Value of the "description" key; copied into the university description.</summary>
    public string description { get; set; }
}
|
||||
|
||||
|
||||
/// <summary>
|
||||
///
|
||||
/// </summary>
|
||||
|
|
@ -97,7 +149,7 @@ namespace New_College.Services
|
|||
single.Syl = issyl == "双一流" ? 1 : 0;
|
||||
single.Subject_Level = schooltype == "本科" ? 0 : 1;
|
||||
single.ModifyTime = DateTime.Now;
|
||||
await this.d_University.Update(single);
|
||||
await this.d_University.Update(single);
|
||||
|
||||
}
|
||||
else
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
|
||||
Microsoft Visual Studio Solution File, Format Version 12.00
|
||||
# Visual Studio Version 16
|
||||
VisualStudioVersion = 16.0.30114.105
|
||||
# Visual Studio Version 17
|
||||
VisualStudioVersion = 17.5.33627.172
|
||||
MinimumVisualStudioVersion = 10.0.40219.1
|
||||
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "New_College.Api", "New_College.Api\New_College.Api.csproj", "{6F47A41A-085E-4422-BB73-5A2CBAA07D9F}"
|
||||
EndProject
|
||||
|
|
@ -27,6 +27,8 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "New_College.ConsoleApp", "N
|
|||
EndProject
|
||||
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "New_College.AdminMvc", "New_College.AdminMvc\New_College.AdminMvc.csproj", "{06D885F3-6352-4BF6-B826-DEA742DFFBD7}"
|
||||
EndProject
|
||||
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "New_Spider", "New_Spider\New_Spider.csproj", "{E23857BF-DFBB-49DD-A86A-1B2932F6D33B}"
|
||||
EndProject
|
||||
Global
|
||||
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||
Debug|Any CPU = Debug|Any CPU
|
||||
|
|
@ -81,6 +83,10 @@ Global
|
|||
{06D885F3-6352-4BF6-B826-DEA742DFFBD7}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
||||
{06D885F3-6352-4BF6-B826-DEA742DFFBD7}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
||||
{06D885F3-6352-4BF6-B826-DEA742DFFBD7}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||
{E23857BF-DFBB-49DD-A86A-1B2932F6D33B}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
|
||||
{E23857BF-DFBB-49DD-A86A-1B2932F6D33B}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
||||
{E23857BF-DFBB-49DD-A86A-1B2932F6D33B}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
||||
{E23857BF-DFBB-49DD-A86A-1B2932F6D33B}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||
EndGlobalSection
|
||||
GlobalSection(SolutionProperties) = preSolution
|
||||
HideSolutionNode = FALSE
|
||||
|
|
|
|||
|
|
@ -0,0 +1,172 @@
|
|||
using HtmlAgilityPack;
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using System.Net;
|
||||
using System.Text;
|
||||
using System.Text.Json;
|
||||
using System.Threading.Tasks;
|
||||
|
||||
namespace New_Spider
|
||||
{
|
||||
/// <summary>
/// Scrapes university descriptions from gaokao.chsi.com.cn with HtmlAgilityPack and
/// hands them to <see cref="ITextGen"/> to be written as JSON files.
/// </summary>
public class HtmlAgHelper
{
    // Rows of a search-result page.
    private const string SchoolRowsXPath = "//*[@id=\"app-yxk-sch-list\"]/div[1]/div[*]";

    // Anchor of a row, relative to that row. A relative path keeps row and link paired,
    // which a separately counted positional index into the parent cannot guarantee.
    private const string SchoolLinkXPath = "./div[1]/div/a";

    // Link on a school page whose target is passed to HtmlUniversityDescriptionResolve.
    private const string DescriptionLinkXPath = "/html/body/div[1]/div[3]/div/a[2]";

    // Node holding the description text on the description page.
    private const string DescriptionTextXPath = "/html/body/div[1]/div[4]/div[3]";

    /// <summary>
    /// Walks the search-result pages, collects the description of every listed school and
    /// writes one JSON file per result page.
    /// </summary>
    /// <param name="lastStart">
    /// Highest page offset to request; pages are fetched at offsets 0, 20, 40, ...
    /// The default of 1 fetches only the first page, which is the previous hard-coded
    /// bound. Pass 2800 to cover the range the commented-out original loop used.
    /// </param>
    public void HtmlCreatePageData(int lastStart = 1)
    {
        HtmlWeb webClient = new HtmlWeb();
        ITextGen textgen = new ITextGen();

        for (int start = 0; start <= lastStart; start += 20)
        {
            var jsonobjs = new List<JsonObj>();

            HtmlDocument doc = webClient.Load(string.Format("https://gaokao.chsi.com.cn/sch/search--ss-on,option-qg,searchType-1,start-{0}.dhtml", start));

            // SelectNodes returns null, not an empty collection, when nothing matches.
            HtmlNodeCollection rows = doc.DocumentNode.SelectNodes(SchoolRowsXPath);
            if (rows == null)
            {
                Console.WriteLine(string.Format("no school rows found at offset {0}; page skipped", start));
                continue;
            }

            foreach (var row in rows)
            {
                // Pause between requests.
                Thread.Sleep(500);

                var link = row.SelectSingleNode(SchoolLinkXPath);
                if (link == null || link.Attributes["href"] == null)
                {
                    continue;
                }

                Console.WriteLine(string.Format("item: a:https://gaokao.chsi.com.cn/{0};name:{1}", link.Attributes["href"].Value, link.InnerText));

                HtmlUniversityAgResolve(doc, webClient, link, link.InnerText.Trim(), jsonobjs);
            }

            Thread.Sleep(1000);
            textgen.GenJson(Newtonsoft.Json.JsonConvert.SerializeObject(jsonobjs), start.ToString());
        }
    }

    /// <summary>
    /// One scraped record; the lower-case property names become the JSON keys.
    /// </summary>
    public class JsonObj
    {
        public string id { get; set; }
        public string name { get; set; }
        public string description { get; set; }
    }

    /// <summary>
    /// Loads a school's page and resolves the description behind each matching link.
    /// </summary>
    /// <param name="htmldoc">Not read; kept so existing callers keep compiling.</param>
    /// <param name="htmlWeb">Client used to load pages.</param>
    /// <param name="htmlNode">Anchor whose <c>href</c> points to the school's page.</param>
    /// <param name="name">School name stored with each record.</param>
    /// <param name="jsons">List that receives the scraped records.</param>
    public void HtmlUniversityAgResolve(HtmlDocument htmldoc, HtmlWeb htmlWeb, HtmlNode htmlNode, string name, List<JsonObj> jsons)
    {
        var schoolPage = htmlWeb.Load(string.Format("https://gaokao.chsi.com.cn/{0}", htmlNode.Attributes["href"].Value));

        HtmlNodeCollection links = schoolPage.DocumentNode.SelectNodes(DescriptionLinkXPath);
        if (links == null)
        {
            return;
        }

        foreach (var link in links)
        {
            Thread.Sleep(500);
            try
            {
                HtmlUniversityDescriptionResolve(schoolPage, htmlWeb, link.Attributes["href"].Value, name, jsons);
            }
            catch (Exception e)
            {
                // Best effort: one broken page must not abort the crawl, but it should
                // leave a trace instead of disappearing silently.
                Console.WriteLine(string.Format("failed to read description for {0}: {1}", name, e.Message));
            }
        }
    }

    /// <summary>
    /// Loads the description page at <paramref name="url"/> and appends one record per
    /// matching description node to <paramref name="jsons"/>.
    /// </summary>
    /// <param name="htmldoc">Not read; kept so existing callers keep compiling.</param>
    /// <param name="htmlWeb">Client used to load the page.</param>
    /// <param name="url">Site-relative address of the description page.</param>
    /// <param name="name">School name stored with each record.</param>
    /// <param name="jsons">List that receives the scraped records.</param>
    public void HtmlUniversityDescriptionResolve(HtmlDocument htmldoc, HtmlWeb htmlWeb, string url, string name, List<JsonObj> jsons)
    {
        var descriptionPage = htmlWeb.Load(string.Format("https://gaokao.chsi.com.cn/{0}", url));

        HtmlNodeCollection nodes = descriptionPage.DocumentNode.SelectNodes(DescriptionTextXPath);
        if (nodes == null)
        {
            return;
        }

        foreach (var node in nodes)
        {
            jsons.Add(new JsonObj()
            {
                description = node.InnerText,
                name = name,
                // Timestamp id in year-month-day-hour order (the earlier format string had
                // hour and day transposed).
                id = DateTime.Now.ToString("yyyyMMddHHmmssfff"),
            });
        }
    }

    /// <summary>
    /// Walks the first result page and every school and description page behind it without
    /// storing anything; it only prints each school link to the console.
    /// </summary>
    public void HtmlAg()
    {
        HtmlWeb webClient = new HtmlWeb();
        HtmlDocument listPage = webClient.Load("https://gaokao.chsi.com.cn/sch/search--ss-on,option-qg,searchType-1,start-0.dhtml");

        HtmlNodeCollection rows = listPage.DocumentNode.SelectNodes(SchoolRowsXPath);
        if (rows == null)
        {
            return;
        }

        foreach (var row in rows)
        {
            var link = row.SelectSingleNode(SchoolLinkXPath);
            if (link == null || link.Attributes["href"] == null)
            {
                continue;
            }

            Console.WriteLine(string.Format("item: a:https://gaokao.chsi.com.cn/{0};name:{1}", link.Attributes["href"].Value, link.InnerText));

            var schoolPage = webClient.Load(string.Format("https://gaokao.chsi.com.cn/{0}", link.Attributes["href"].Value));
            HtmlNodeCollection descriptionLinks = schoolPage.DocumentNode.SelectNodes(DescriptionLinkXPath);
            if (descriptionLinks == null)
            {
                continue;
            }

            foreach (var descriptionLink in descriptionLinks)
            {
                var descriptionPage = webClient.Load(string.Format("https://gaokao.chsi.com.cn/{0}", descriptionLink.Attributes["href"].Value));

                // The description nodes are looked up but deliberately not kept.
                _ = descriptionPage.DocumentNode.SelectNodes(DescriptionTextXPath);
            }
        }
    }
}
|
||||
}
|
||||
|
|
@ -0,0 +1,28 @@
|
|||
using Newtonsoft.Json;
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using System.Security.AccessControl;
|
||||
using System.Text;
|
||||
using System.Threading.Tasks;
|
||||
using System.Xml;
|
||||
|
||||
namespace New_Spider
|
||||
{
|
||||
/// <summary>
/// Writes scraped data to JSON files in the export folder.
/// </summary>
public class ITextGen
{
    /// <summary>
    /// Writes <paramref name="jsons"/> to <c>{Ids}.json</c> in the export folder,
    /// creating the folder first when it does not exist.
    /// </summary>
    /// <param name="jsons">Already-serialized JSON text.</param>
    /// <param name="Ids">File name without extension.</param>
    public void GenJson(string jsons, string Ids)
    {
        string path = string.Format(@"D:\\jsondoc\\2023-09-14\\{0}.json", Ids);

        // File.WriteAllText does not create missing folders.
        var folder = Path.GetDirectoryName(path);
        if (!string.IsNullOrEmpty(folder))
        {
            Directory.CreateDirectory(folder);
        }

        // The text is serialized once more, so the file holds a single JSON string literal.
        // This is kept on purpose: the importer that reads these files expects that shape.
        // Indented formatting has no effect on a string value and is therefore not requested.
        string jsonString = JsonConvert.SerializeObject(jsons);
        File.WriteAllText(path, jsonString);
    }
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,15 @@
|
|||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
|
||||
<PropertyGroup>
|
||||
<OutputType>Exe</OutputType>
|
||||
<TargetFramework>net6.0</TargetFramework>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
<Nullable>enable</Nullable>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<PackageReference Include="HtmlAgilityPack" Version="1.11.53" />
|
||||
<PackageReference Include="Newtonsoft.Json" Version="12.0.3" />
|
||||
</ItemGroup>
|
||||
|
||||
</Project>
|
||||
|
|
@ -0,0 +1,13 @@
|
|||
|
||||
using HtmlAgilityPack;
|
||||
using New_Spider;
|
||||
using System.Text.RegularExpressions;
|
||||
|
||||
// Entry point: run the crawl once.
var spider = new HtmlAgHelper();
spider.HtmlCreatePageData();

// Block until a character is read from standard input (presumably to keep the console open).
Console.Read();

// Console-template leftover, printed after the read above.
// See https://aka.ms/new-console-template for more information
Console.WriteLine("Hello, World!");
|
||||
Loading…
Reference in New Issue