Files
CROWLER/Parsing/HtmlParsingHelpers.cs
2026-04-04 10:52:30 +03:00

65 lines
1.6 KiB
C#

using System;
using System.Text.RegularExpressions;
using HtmlAgilityPack;
namespace CRAWLER.Parsing;
/// <summary>
/// Small, stateless helpers for cleaning up text and URLs extracted from HTML.
/// </summary>
internal static class HtmlParsingHelpers
{
    // Matches any run of whitespace; built once instead of re-supplying the pattern on every call.
    private static readonly Regex WhitespaceRun = new Regex(@"\s+", RegexOptions.Compiled);

    /// <summary>
    /// Decodes HTML entities, turns non-breaking spaces into regular spaces,
    /// collapses every whitespace run into a single space and trims the result.
    /// </summary>
    /// <param name="value">Raw text, possibly containing HTML entities.</param>
    /// <returns>
    /// The cleaned text, or <c>null</c> when <paramref name="value"/> is null, blank,
    /// or becomes blank after entity decoding (for example <c>&amp;nbsp;</c>).
    /// </returns>
    public static string NormalizeWhitespace(string value)
    {
        if (string.IsNullOrWhiteSpace(value))
        {
            return null;
        }

        var decoded = HtmlEntity.DeEntitize(value).Replace('\u00A0', ' ');
        var collapsed = WhitespaceRun.Replace(decoded, " ").Trim();

        // Entities such as "&nbsp;" pass the blank check above but decode to whitespace only;
        // report them as "no value" just like any other blank input.
        return collapsed.Length == 0 ? null : collapsed;
    }

    /// <summary>
    /// Normalizes a label: whitespace is normalized via <see cref="NormalizeWhitespace"/>,
    /// a space before a colon is removed, and leading/trailing colons and spaces are stripped.
    /// </summary>
    /// <param name="value">Raw label text.</param>
    /// <returns>
    /// The cleaned label, or <c>null</c> when <see cref="NormalizeWhitespace"/> yields <c>null</c>.
    /// </returns>
    public static string NormalizeLabel(string value)
    {
        return NormalizeWhitespace(value)
            ?.Replace(" :", ":")
            .Trim(':', ' ');
    }

    /// <summary>
    /// Turns a URL or path into an absolute URL string.
    /// </summary>
    /// <param name="baseUrl">
    /// Base URL used to resolve relative values. A trailing slash is enforced, so the whole
    /// base path is kept as a prefix.
    /// </param>
    /// <param name="urlOrPath">
    /// Absolute URL, protocol-relative reference (<c>//host/path</c>), or a path.
    /// Leading slashes of a path are dropped, so it is appended to the base path.
    /// </param>
    /// <returns>
    /// The absolute URL, or <c>null</c> when <paramref name="urlOrPath"/> is null or blank.
    /// </returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="baseUrl"/> is null and <paramref name="urlOrPath"/> is not already absolute.
    /// </exception>
    /// <exception cref="UriFormatException">The base URL or the combined URL is not valid.</exception>
    public static string MakeAbsoluteUrl(string baseUrl, string urlOrPath)
    {
        if (string.IsNullOrWhiteSpace(urlOrPath))
        {
            return null;
        }

        var candidate = urlOrPath.Trim();

        // A value starting with '/' can never carry a scheme. Skipping the absolute parse for it
        // matters on non-Windows platforms, where "/a/b" would otherwise be accepted as an
        // implicit file path and come back as "file:///a/b".
        var startsWithSlash = candidate.StartsWith("/", StringComparison.Ordinal);
        if (!startsWithSlash && Uri.TryCreate(candidate, UriKind.Absolute, out var absoluteUri))
        {
            return absoluteUri.ToString();
        }

        if (baseUrl == null)
        {
            throw new ArgumentNullException(nameof(baseUrl));
        }

        var baseUri = new Uri(baseUrl.TrimEnd('/') + "/");

        // Protocol-relative reference ("//host/path"): reuse the scheme of the base URL.
        if (candidate.StartsWith("//", StringComparison.Ordinal)
            && Uri.TryCreate(baseUri.Scheme + ":" + candidate, UriKind.Absolute, out var networkUri))
        {
            return networkUri.ToString();
        }

        return new Uri(baseUri, candidate.TrimStart('/')).ToString();
    }

    /// <summary>
    /// Returns the normalized text of a node while ignoring its direct child
    /// <c>&lt;p&gt;</c> elements. The node passed in is not modified; the work is done on a deep clone.
    /// </summary>
    /// <param name="node">Node to read text from.</param>
    /// <returns>
    /// The remaining text normalized via <see cref="NormalizeWhitespace"/>,
    /// or <c>null</c> when <paramref name="node"/> is null or no text remains.
    /// </returns>
    public static string ExtractNodeTextExcludingChildParagraph(HtmlNode node)
    {
        if (node == null)
        {
            return null;
        }

        var clone = node.CloneNode(true);

        // "./p" selects only direct children; SelectNodes returns null when nothing matches.
        var paragraphs = clone.SelectNodes("./p");
        if (paragraphs != null)
        {
            foreach (var paragraph in paragraphs)
            {
                paragraph.Remove();
            }
        }

        return NormalizeWhitespace(clone.InnerText);
    }
}