Добавьте файлы проекта.

This commit is contained in:
Курнат Андрей
2026-04-04 10:52:30 +03:00
parent 9b34a92f15
commit 5a55bc5f4c
30 changed files with 3446 additions and 0 deletions

View File

@@ -0,0 +1,86 @@
using System;
using System.Collections.Generic;
using CRAWLER.Models;
using HtmlAgilityPack;
namespace CRAWLER.Parsing;
/// <summary>
/// Turns the HTML of a catalog listing page into a list of <see cref="CatalogListItem"/> entries.
/// </summary>
internal sealed class CatalogPageParser
{
    // Matches every <div> whose class attribute contains the whole token "sivreestre".
    private const string ItemXPath =
        "//div[contains(concat(' ', normalize-space(@class), ' '), ' sivreestre ')]";

    /// <summary>
    /// Parses the listing page and returns one entry per matching item container.
    /// </summary>
    /// <param name="html">Raw page markup; <c>null</c> is treated as an empty document.</param>
    /// <param name="baseUrl">Base address used to make the extracted links absolute.</param>
    /// <returns>
    /// The entries that have a non-blank registry number or a non-blank name;
    /// an empty list when the page contains no item containers.
    /// </returns>
    public IReadOnlyList<CatalogListItem> Parse(string html, string baseUrl)
    {
        var page = new HtmlDocument();
        page.LoadHtml(html ?? string.Empty);

        var result = new List<CatalogListItem>();
        var containers = page.DocumentNode.SelectNodes(ItemXPath);
        if (containers != null)
        {
            for (var index = 0; index < containers.Count; index++)
            {
                var entry = BuildItem(containers[index], baseUrl);

                // An entry without both a registry number and a name is dropped.
                var hasRegistryNumber = !string.IsNullOrWhiteSpace(entry.RegistryNumber);
                var hasName = !string.IsNullOrWhiteSpace(entry.Name);
                if (hasRegistryNumber || hasName)
                {
                    result.Add(entry);
                }
            }
        }

        return result;
    }

    /// <summary>Reads every field of a single item container.</summary>
    private static CatalogListItem BuildItem(HtmlNode container, string baseUrl)
    {
        var registryNumber = ReadBlockValue(container, "№ в госреестре");
        var name = ReadBlockValue(container, "Наименование");
        var typeDesignation = ReadBlockValue(container, "Тип");
        var manufacturer = ReadBlockValue(container, "Производитель");
        var verificationInterval = ReadBlockValue(container, "МПИ");

        // Two spellings of the same header are tried in turn. They look identical on screen;
        // presumably the first starts with a Latin 'C' and the second with a Cyrillic 'С' —
        // verify before touching either literal.
        var certificateOrSerial = ReadBlockValue(container, "Cвидетельство завод. номер");
        if (certificateOrSerial == null)
        {
            certificateOrSerial = ReadBlockValue(container, "Свидетельство завод. номер");
        }

        return new CatalogListItem
        {
            RegistryNumber = registryNumber,
            Name = name,
            TypeDesignation = typeDesignation,
            Manufacturer = manufacturer,
            VerificationInterval = verificationInterval,
            CertificateOrSerialNumber = certificateOrSerial,
            DetailUrl = ReadDetailUrl(container, baseUrl),
            DescriptionTypePdfUrl = ReadPdfUrl(container, "/prof/opisanie/", baseUrl),
            MethodologyPdfUrl = ReadPdfUrl(container, "/prof/metodiki/", baseUrl)
        };
    }

    /// <summary>
    /// Returns the text of the child block labelled <paramref name="header"/>,
    /// with the label paragraph itself left out.
    /// </summary>
    private static string ReadBlockValue(HtmlNode root, string header)
    {
        return HtmlParsingHelpers.ExtractNodeTextExcludingChildParagraph(FindBlockByHeader(root, header));
    }

    /// <summary>
    /// Finds the first direct child <c>div</c> whose first direct <c>p</c> carries the given label
    /// (case-insensitive ordinal comparison after label normalization); <c>null</c> when none matches.
    /// </summary>
    private static HtmlNode FindBlockByHeader(HtmlNode root, string header)
    {
        var candidates = root.SelectNodes("./div");
        if (candidates == null)
        {
            return null;
        }

        for (var index = 0; index < candidates.Count; index++)
        {
            var candidate = candidates[index];
            var caption = candidate.SelectSingleNode("./p");
            var normalized = HtmlParsingHelpers.NormalizeLabel(caption?.InnerText);
            if (string.Equals(normalized, header, StringComparison.OrdinalIgnoreCase))
            {
                return candidate;
            }
        }

        return null;
    }

    /// <summary>
    /// Resolves the link to the item's detail page: the first anchor directly inside a
    /// "resulttable6" div, or failing that inside a "resulttable4" div.
    /// </summary>
    private static string ReadDetailUrl(HtmlNode root, string baseUrl)
    {
        var anchor = root.SelectSingleNode(".//div[contains(@class,'resulttable6')]/a[1]");
        if (anchor == null)
        {
            anchor = root.SelectSingleNode(".//div[contains(@class,'resulttable4')]/a[1]");
        }

        var href = anchor?.GetAttributeValue("href", null);
        return HtmlParsingHelpers.MakeAbsoluteUrl(baseUrl, href);
    }

    /// <summary>
    /// Resolves the first descendant anchor whose <c>href</c> contains <paramref name="marker"/>.
    /// </summary>
    private static string ReadPdfUrl(HtmlNode root, string marker, string baseUrl)
    {
        var xpath = ".//a[contains(@href,'" + marker + "')]";
        var anchor = root.SelectSingleNode(xpath);
        var href = anchor?.GetAttributeValue("href", null);
        return HtmlParsingHelpers.MakeAbsoluteUrl(baseUrl, href);
    }
}

View File

@@ -0,0 +1,65 @@
using System;
using System.Collections.Generic;
using CRAWLER.Models;
using HtmlAgilityPack;
namespace CRAWLER.Parsing;
/// <summary>
/// Turns the HTML of an instrument detail page into a <see cref="ParsedInstrumentDetails"/> record.
/// </summary>
internal sealed class DetailPageParser
{
    /// <summary>
    /// Reads the label/value rows of the "resulttable1" table plus the two PDF links.
    /// </summary>
    /// <param name="html">Raw page markup; <c>null</c> is treated as an empty document.</param>
    /// <param name="baseUrl">Base address used to make the extracted links absolute.</param>
    /// <returns>
    /// A record whose properties are <c>null</c> wherever the corresponding label or link
    /// is not present on the page.
    /// </returns>
    public ParsedInstrumentDetails Parse(string html, string baseUrl)
    {
        var page = new HtmlDocument();
        page.LoadHtml(html ?? string.Empty);
        var root = page.DocumentNode;

        var fields = ReadFieldTable(root);

        // Missing labels yield null rather than throwing.
        string Field(string label)
        {
            if (fields.TryGetValue(label, out var text))
            {
                return text;
            }

            return null;
        }

        return new ParsedInstrumentDetails
        {
            RegistryNumber = Field("Номер в госреестре"),
            Name = Field("Наименование"),
            TypeDesignation = Field("Обозначение типа"),
            Manufacturer = Field("Производитель"),
            VerificationInterval = Field("Межповерочный интервал (МПИ)"),
            CertificateOrSerialNumber = Field("Срок свидетельства или заводской номер"),
            AllowsBatchVerification = Field("Допускается поверка партии"),
            HasPeriodicVerification = Field("Наличие периодической поверки"),
            TypeInfo = Field("Сведения о типе"),
            Purpose = Field("Назначение"),
            Description = Field("Описание"),
            Software = Field("Программное обеспечение"),
            MetrologicalCharacteristics = Field("Метрологические и технические характеристики"),
            Completeness = Field("Комплектность"),
            Verification = Field("Поверка"),
            RegulatoryDocuments = Field("Нормативные и технические документы"),
            Applicant = Field("Заявитель"),
            TestCenter = Field("Испытательный центр"),
            DescriptionTypePdfUrl = ResolveLink(root, "//a[contains(@href,'/prof/opisanie/')]", baseUrl),
            MethodologyPdfUrl = ResolveLink(root, "//a[contains(@href,'/prof/metodiki/')]", baseUrl)
        };
    }

    /// <summary>
    /// Collects the first two cells of every table row as a label → value pair.
    /// Keys compare case-insensitively; when a label repeats, the last row wins.
    /// </summary>
    private static Dictionary<string, string> ReadFieldTable(HtmlNode root)
    {
        var fields = new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase);

        var tableRows = root.SelectNodes("//table[contains(@class,'resulttable1')]//tr");
        if (tableRows == null)
        {
            return fields;
        }

        foreach (var tableRow in tableRows)
        {
            var tableCells = tableRow.SelectNodes("./td");

            // Only rows that have both a label cell and a value cell are considered.
            if (tableCells != null && tableCells.Count >= 2)
            {
                var key = HtmlParsingHelpers.NormalizeLabel(tableCells[0].InnerText);
                var text = HtmlParsingHelpers.NormalizeWhitespace(tableCells[1].InnerText);
                if (!string.IsNullOrWhiteSpace(key))
                {
                    fields[key] = text;
                }
            }
        }

        return fields;
    }

    /// <summary>
    /// Makes absolute the <c>href</c> of the first node matching <paramref name="xpath"/>.
    /// </summary>
    private static string ResolveLink(HtmlNode root, string xpath, string baseUrl)
    {
        var anchor = root.SelectSingleNode(xpath);
        var href = anchor?.GetAttributeValue("href", null);
        return HtmlParsingHelpers.MakeAbsoluteUrl(baseUrl, href);
    }
}

View File

@@ -0,0 +1,64 @@
using System;
using System.Text.RegularExpressions;
using HtmlAgilityPack;
namespace CRAWLER.Parsing;
/// <summary>
/// Text and URL normalization helpers shared by the HTML page parsers.
/// </summary>
internal static class HtmlParsingHelpers
{
    // Cached so the pattern is not looked up on every call.
    private static readonly Regex WhitespaceRun = new Regex(@"\s+");

    /// <summary>
    /// Decodes HTML entities, turns non-breaking spaces into regular spaces, collapses every
    /// whitespace run into a single space and trims the result.
    /// </summary>
    /// <param name="value">Raw text, typically the inner text of an HTML node.</param>
    /// <returns>
    /// The cleaned text, or <c>null</c> when the input is null/blank or contains nothing
    /// but whitespace once entities have been decoded.
    /// </returns>
    public static string NormalizeWhitespace(string value)
    {
        if (string.IsNullOrWhiteSpace(value))
        {
            return null;
        }

        var decoded = HtmlEntity.DeEntitize(value);
        decoded = decoded.Replace('\u00A0', ' ');
        decoded = WhitespaceRun.Replace(decoded, " ").Trim();

        // Input such as "&nbsp;" only becomes blank after decoding; report it as "no value"
        // like any other blank input so that callers' null checks and ?? fallbacks apply.
        return decoded.Length == 0 ? null : decoded;
    }

    /// <summary>
    /// Normalizes a field label: cleans whitespace, replaces " :" with ":" and strips
    /// colons and spaces from both ends.
    /// </summary>
    /// <param name="value">Raw label text.</param>
    /// <returns>The normalized label, or <c>null</c> when the input has no visible text.</returns>
    public static string NormalizeLabel(string value)
    {
        return NormalizeWhitespace(value)
            ?.Replace(" :", ":")
            .Trim(':', ' ');
    }

    /// <summary>
    /// Converts a link taken from a page into an absolute URL.
    /// </summary>
    /// <param name="baseUrl">
    /// Address that relative links are resolved against; trailing slashes are ignored.
    /// Must be a valid absolute URL whenever <paramref name="urlOrPath"/> is not itself absolute.
    /// </param>
    /// <param name="urlOrPath">Absolute URL, protocol-relative URL or relative path.</param>
    /// <returns>
    /// The absolute URL, or <c>null</c> when <paramref name="urlOrPath"/> is null or blank.
    /// A leading '/' on a path is dropped, so the path is appended to <paramref name="baseUrl"/>.
    /// </returns>
    public static string MakeAbsoluteUrl(string baseUrl, string urlOrPath)
    {
        if (string.IsNullOrWhiteSpace(urlOrPath))
        {
            return null;
        }

        if (Uri.TryCreate(urlOrPath, UriKind.Absolute, out var absoluteUri)
            && !IsImplicitFilePath(absoluteUri, urlOrPath))
        {
            return absoluteUri.ToString();
        }

        var baseUri = new Uri(baseUrl.TrimEnd('/') + "/");

        // "//host/path" carries its own host and only borrows the scheme from the base;
        // trimming its slashes would wrongly turn the host into a path segment.
        if (urlOrPath.StartsWith("//", StringComparison.Ordinal))
        {
            return new Uri(baseUri, urlOrPath).ToString();
        }

        return new Uri(baseUri, urlOrPath.TrimStart('/')).ToString();
    }

    /// <summary>
    /// Returns the normalized text of <paramref name="node"/> with its direct child
    /// <c>p</c> elements left out. The node itself is not modified; a deep clone is edited.
    /// </summary>
    /// <param name="node">Node to read; may be <c>null</c>.</param>
    /// <returns>The remaining text, or <c>null</c> when the node is null or has no other text.</returns>
    public static string ExtractNodeTextExcludingChildParagraph(HtmlNode node)
    {
        if (node == null)
        {
            return null;
        }

        var clone = node.CloneNode(true);
        var paragraphs = clone.SelectNodes("./p");
        if (paragraphs != null)
        {
            foreach (var paragraph in paragraphs)
            {
                paragraph.Remove();
            }
        }

        return NormalizeWhitespace(clone.InnerText);
    }

    /// <summary>
    /// Tells whether an "absolute" parse result is really a path that was promoted to a file URI:
    /// with <see cref="UriKind.Absolute"/>, .NET accepts "/path" on Unix-like systems and
    /// "//host/path" on Windows as file locations even though the text names no scheme.
    /// </summary>
    private static bool IsImplicitFilePath(Uri parsed, string original)
    {
        return parsed.IsFile
            && !original.TrimStart().StartsWith("file:", StringComparison.OrdinalIgnoreCase);
    }
}