Добавьте файлы проекта.

This commit is contained in:
Курнат Андрей
2026-04-04 10:52:30 +03:00
parent 9b34a92f15
commit 5a55bc5f4c
30 changed files with 3446 additions and 0 deletions

View File

@@ -0,0 +1,86 @@
using System;
using System.Collections.Generic;
using CRAWLER.Models;
using HtmlAgilityPack;
namespace CRAWLER.Parsing;
/// <summary>
/// Parses a catalog listing page into <see cref="CatalogListItem"/> entries.
/// </summary>
/// <remarks>
/// An entry is any <c>div</c> whose <c>class</c> attribute contains the <c>sivreestre</c> token.
/// Inside an entry, fields are located through its direct child <c>div</c> blocks, each of which
/// is identified by the normalized text of its first direct <c>p</c> child.
/// </remarks>
internal sealed class CatalogPageParser
{
    // Matches "sivreestre" as a whole class token, not as a substring of another class name.
    private const string ItemXPath =
        "//div[contains(concat(' ', normalize-space(@class), ' '), ' sivreestre ')]";

    private const string PrimaryDetailLinkXPath = ".//div[contains(@class,'resulttable6')]/a[1]";
    private const string FallbackDetailLinkXPath = ".//div[contains(@class,'resulttable4')]/a[1]";

    /// <summary>
    /// Extracts every catalog entry found in <paramref name="html"/>.
    /// </summary>
    /// <param name="html">Raw page markup; <c>null</c> is treated as an empty document.</param>
    /// <param name="baseUrl">Base address passed to the URL helper to resolve entry links.</param>
    /// <returns>
    /// The parsed entries in document order. The list is empty when the page has no entry nodes.
    /// Entries that have neither a registry number nor a name are skipped.
    /// </returns>
    public IReadOnlyList<CatalogListItem> Parse(string html, string baseUrl)
    {
        var document = new HtmlDocument();
        document.LoadHtml(html ?? string.Empty);

        var items = new List<CatalogListItem>();

        // HtmlAgilityPack returns null (not an empty collection) when nothing matches.
        var nodes = document.DocumentNode.SelectNodes(ItemXPath);
        if (nodes == null)
        {
            return items;
        }

        foreach (var node in nodes)
        {
            var item = new CatalogListItem
            {
                RegistryNumber = ReadBlockValue(node, "№ в госреестре"),
                Name = ReadBlockValue(node, "Наименование"),
                TypeDesignation = ReadBlockValue(node, "Тип"),
                Manufacturer = ReadBlockValue(node, "Производитель"),
                VerificationInterval = ReadBlockValue(node, "МПИ"),
                // Two spellings of the same label are accepted; NOTE(review): they appear to
                // differ only in the first letter (Latin "C" vs. Cyrillic "С") - confirm.
                // The fallback is resolved while locating the block, so it does not depend on
                // what the text-extraction helper returns for a missing block.
                CertificateOrSerialNumber = ReadBlockValue(
                    node,
                    "Cвидетельство завод. номер",
                    "Свидетельство завод. номер"),
                DetailUrl = ReadDetailUrl(node, baseUrl),
                DescriptionTypePdfUrl = ReadPdfUrl(node, "/prof/opisanie/", baseUrl),
                MethodologyPdfUrl = ReadPdfUrl(node, "/prof/metodiki/", baseUrl)
            };

            // Keep only entries that can be identified by number or by name.
            if (!string.IsNullOrWhiteSpace(item.RegistryNumber) || !string.IsNullOrWhiteSpace(item.Name))
            {
                items.Add(item);
            }
        }

        return items;
    }

    /// <summary>
    /// Returns the text of the first block under <paramref name="root"/> whose label matches one
    /// of <paramref name="headers"/>, trying the headers in the given order.
    /// </summary>
    /// <param name="root">Entry node whose direct child blocks are searched.</param>
    /// <param name="headers">Accepted label spellings, in priority order.</param>
    /// <returns>
    /// Whatever <c>HtmlParsingHelpers.ExtractNodeTextExcludingChildParagraph</c> produces for the
    /// matched block, or for <c>null</c> when no header matches.
    /// </returns>
    private static string ReadBlockValue(HtmlNode root, params string[] headers)
    {
        foreach (var header in headers)
        {
            var block = FindBlockByHeader(root, header);
            if (block != null)
            {
                return HtmlParsingHelpers.ExtractNodeTextExcludingChildParagraph(block);
            }
        }

        // No block matched: let the helper decide what a missing block yields.
        return HtmlParsingHelpers.ExtractNodeTextExcludingChildParagraph(null);
    }

    /// <summary>
    /// Finds the first direct child <c>div</c> of <paramref name="root"/> whose first direct
    /// <c>p</c> child has a normalized label equal to <paramref name="header"/>, ignoring case.
    /// </summary>
    /// <returns>The matching block, or <c>null</c> when there is none.</returns>
    private static HtmlNode FindBlockByHeader(HtmlNode root, string header)
    {
        var blocks = root.SelectNodes("./div");
        if (blocks == null)
        {
            return null;
        }

        foreach (var block in blocks)
        {
            var paragraph = block.SelectSingleNode("./p");
            var label = HtmlParsingHelpers.NormalizeLabel(paragraph?.InnerText);
            if (string.Equals(label, header, StringComparison.OrdinalIgnoreCase))
            {
                return block;
            }
        }

        return null;
    }

    /// <summary>
    /// Resolves the entry's detail-page link: the first anchor directly inside a
    /// <c>resulttable6</c> block, falling back to the one inside a <c>resulttable4</c> block.
    /// </summary>
    /// <returns>The result of the URL helper for the link's <c>href</c> (or for <c>null</c> when no link exists).</returns>
    private static string ReadDetailUrl(HtmlNode root, string baseUrl)
    {
        var link = root.SelectSingleNode(PrimaryDetailLinkXPath)
            ?? root.SelectSingleNode(FallbackDetailLinkXPath);

        return HtmlParsingHelpers.MakeAbsoluteUrl(baseUrl, link?.GetAttributeValue("href", null));
    }

    /// <summary>
    /// Resolves the first link under <paramref name="root"/> whose <c>href</c> contains
    /// <paramref name="marker"/>.
    /// </summary>
    /// <param name="root">Entry node to search.</param>
    /// <param name="marker">
    /// Path fragment embedded verbatim into the XPath expression; it must not contain a single quote.
    /// </param>
    /// <param name="baseUrl">Base address passed to the URL helper.</param>
    /// <returns>The result of the URL helper for the link's <c>href</c> (or for <c>null</c> when no link exists).</returns>
    private static string ReadPdfUrl(HtmlNode root, string marker, string baseUrl)
    {
        var link = root.SelectSingleNode($".//a[contains(@href,'{marker}')]");
        return HtmlParsingHelpers.MakeAbsoluteUrl(baseUrl, link?.GetAttributeValue("href", null));
    }
}