87 lines
3.1 KiB
C#
87 lines
3.1 KiB
C#
using System;
|
||
using System.Collections.Generic;
|
||
using CRAWLER.Models;
|
||
using HtmlAgilityPack;
|
||
|
||
namespace CRAWLER.Parsing;
|
||
|
||
/// <summary>
/// Parses a catalog listing page into <see cref="CatalogListItem"/> entries.
/// </summary>
internal sealed class CatalogPageParser
{
    /// <summary>
    /// Builds one item for every <c>div</c> whose class list contains the <c>sivreestre</c> token.
    /// </summary>
    /// <param name="html">Page markup; <c>null</c> is loaded as an empty document.</param>
    /// <param name="baseUrl">Base address passed to <c>HtmlParsingHelpers.MakeAbsoluteUrl</c> for every extracted link.</param>
    /// <returns>
    /// The items that have a non-blank registry number or a non-blank name.
    /// An empty list is returned when no matching element exists.
    /// </returns>
    public IReadOnlyList<CatalogListItem> Parse(string html, string baseUrl)
    {
        var document = new HtmlDocument();
        document.LoadHtml(html ?? string.Empty);

        var items = new List<CatalogListItem>();

        // The concat/normalize-space idiom matches "sivreestre" as a whole class token,
        // not as a substring of some longer class name.
        var nodes = document.DocumentNode.SelectNodes("//div[contains(concat(' ', normalize-space(@class), ' '), ' sivreestre ')]");
        if (nodes is null)
        {
            return items;
        }

        foreach (var node in nodes)
        {
            var item = new CatalogListItem
            {
                RegistryNumber = ReadBlockValue(node, "№ в госреестре"),
                Name = ReadBlockValue(node, "Наименование"),
                TypeDesignation = ReadBlockValue(node, "Тип"),
                Manufacturer = ReadBlockValue(node, "Производитель"),
                VerificationInterval = ReadBlockValue(node, "МПИ"),
                CertificateOrSerialNumber = ReadCertificateOrSerialNumber(node),
                DetailUrl = ReadDetailUrl(node, baseUrl),
                DescriptionTypePdfUrl = ReadPdfUrl(node, "/prof/opisanie/", baseUrl),
                MethodologyPdfUrl = ReadPdfUrl(node, "/prof/metodiki/", baseUrl)
            };

            // Rows with neither a registry number nor a name are dropped.
            if (!string.IsNullOrWhiteSpace(item.RegistryNumber) || !string.IsNullOrWhiteSpace(item.Name))
            {
                items.Add(item);
            }
        }

        return items;
    }

    /// <summary>
    /// Reads the certificate / serial number cell, trying both spellings of its header.
    /// </summary>
    /// <remarks>
    /// The two header literals look the same on screen; the first appears to begin with a Latin
    /// "C" and the second with a Cyrillic "С". The second spelling is consulted whenever the first
    /// yields <c>null</c> or blank text, so the fallback works regardless of whether the text
    /// extraction helper reports a missing block as <c>null</c> or as an empty string.
    /// </remarks>
    /// <param name="root">The catalog row element.</param>
    /// <returns>The first non-blank value found; otherwise whatever the lookups returned.</returns>
    private static string ReadCertificateOrSerialNumber(HtmlNode root)
    {
        var primary = ReadBlockValue(root, "Cвидетельство завод. номер");
        if (!string.IsNullOrWhiteSpace(primary))
        {
            return primary;
        }

        var fallback = ReadBlockValue(root, "Свидетельство завод. номер");
        if (!string.IsNullOrWhiteSpace(fallback))
        {
            return fallback;
        }

        // Neither spelling produced text: keep the result the plain "??" chain would have given.
        return primary ?? fallback;
    }

    /// <summary>
    /// Returns the text of the child block labelled <paramref name="header"/>, as produced by
    /// <c>HtmlParsingHelpers.ExtractNodeTextExcludingChildParagraph</c>.
    /// </summary>
    /// <param name="root">The catalog row element.</param>
    /// <param name="header">Expected label text of the block.</param>
    /// <returns>The helper's result; the helper receives <c>null</c> when no block matches.</returns>
    private static string ReadBlockValue(HtmlNode root, string header)
    {
        var block = FindBlockByHeader(root, header);
        return HtmlParsingHelpers.ExtractNodeTextExcludingChildParagraph(block);
    }

    /// <summary>
    /// Finds the first direct child <c>div</c> whose first direct <c>p</c> child carries the given label.
    /// </summary>
    /// <param name="root">Element whose direct <c>div</c> children are scanned.</param>
    /// <param name="header">Label to match after <c>HtmlParsingHelpers.NormalizeLabel</c>; compared ordinally, ignoring case.</param>
    /// <returns>The matching block, or <c>null</c> when there is none.</returns>
    private static HtmlNode FindBlockByHeader(HtmlNode root, string header)
    {
        var blocks = root.SelectNodes("./div");
        if (blocks is null)
        {
            return null;
        }

        foreach (var block in blocks)
        {
            var paragraph = block.SelectSingleNode("./p");
            var label = HtmlParsingHelpers.NormalizeLabel(paragraph?.InnerText);
            if (string.Equals(label, header, StringComparison.OrdinalIgnoreCase))
            {
                return block;
            }
        }

        return null;
    }

    /// <summary>
    /// Resolves the detail-page link of a row.
    /// </summary>
    /// <param name="root">The catalog row element.</param>
    /// <param name="baseUrl">Base address passed to <c>HtmlParsingHelpers.MakeAbsoluteUrl</c>.</param>
    /// <returns>
    /// The result of <c>MakeAbsoluteUrl</c> for the link's <c>href</c>; the helper receives
    /// <c>null</c> when no link (or no <c>href</c>) is present.
    /// </returns>
    private static string ReadDetailUrl(HtmlNode root, string baseUrl)
    {
        // Prefer the anchor inside a "resulttable6" div and fall back to "resulttable4".
        var link = root.SelectSingleNode(".//div[contains(@class,'resulttable6')]/a[1]")
                   ?? root.SelectSingleNode(".//div[contains(@class,'resulttable4')]/a[1]");
        return HtmlParsingHelpers.MakeAbsoluteUrl(baseUrl, link?.GetAttributeValue("href", null));
    }

    /// <summary>
    /// Resolves the first descendant link whose <c>href</c> contains <paramref name="marker"/>.
    /// </summary>
    /// <param name="root">The catalog row element.</param>
    /// <param name="marker">
    /// Substring expected inside the <c>href</c>. It is embedded in a single-quoted XPath literal,
    /// so it must not contain an apostrophe.
    /// </param>
    /// <param name="baseUrl">Base address passed to <c>HtmlParsingHelpers.MakeAbsoluteUrl</c>.</param>
    /// <returns>
    /// The result of <c>MakeAbsoluteUrl</c> for the link's <c>href</c>; the helper receives
    /// <c>null</c> when no such link exists.
    /// </returns>
    private static string ReadPdfUrl(HtmlNode root, string marker, string baseUrl)
    {
        var link = root.SelectSingleNode($".//a[contains(@href,'{marker}')]");
        return HtmlParsingHelpers.MakeAbsoluteUrl(baseUrl, link?.GetAttributeValue("href", null));
    }
}
|