using System;
using System.Text.RegularExpressions;
using HtmlAgilityPack;

namespace CRAWLER.Parsing;

/// <summary>
/// Static helpers for normalizing text extracted from HTML and for resolving URLs.
/// </summary>
internal static class HtmlParsingHelpers
{
    // Cached so the pattern is parsed once rather than looked up on every call.
    private static readonly Regex WhitespaceRun = new Regex(@"\s+", RegexOptions.Compiled);

    /// <summary>
    /// Decodes HTML entities, converts non-breaking spaces to regular spaces,
    /// collapses every run of whitespace to a single space and trims the result.
    /// </summary>
    /// <param name="value">Raw text, possibly containing HTML entities.</param>
    /// <returns>
    /// The normalized text, or <c>null</c> when <paramref name="value"/> is null,
    /// blank, or becomes blank after entity decoding (for example <c>&amp;nbsp;</c>).
    /// </returns>
    public static string NormalizeWhitespace(string value)
    {
        if (string.IsNullOrWhiteSpace(value))
        {
            return null;
        }

        var decoded = HtmlEntity.DeEntitize(value);
        decoded = decoded.Replace('\u00A0', ' ');
        decoded = WhitespaceRun.Replace(decoded, " ").Trim();

        // Entity-only input passes the guard above but decodes to nothing but
        // whitespace; report it as null, exactly like literal blank input.
        return decoded.Length == 0 ? null : decoded;
    }

    /// <summary>
    /// Normalizes a label: applies <see cref="NormalizeWhitespace"/>, removes the
    /// space in front of any colon and strips leading/trailing colons and spaces.
    /// </summary>
    /// <param name="value">Raw label text.</param>
    /// <returns>
    /// The cleaned label, or <c>null</c> when <see cref="NormalizeWhitespace"/>
    /// returns <c>null</c> for <paramref name="value"/>.
    /// </returns>
    public static string NormalizeLabel(string value)
    {
        return NormalizeWhitespace(value)
            ?.Replace(" :", ":")
            .Trim(':', ' ');
    }

    /// <summary>
    /// Turns <paramref name="urlOrPath"/> into an absolute URL, resolving it
    /// against <paramref name="baseUrl"/> when it is not already absolute.
    /// </summary>
    /// <param name="baseUrl">
    /// Absolute base URL. Only read when <paramref name="urlOrPath"/> is not
    /// already an absolute URL.
    /// </param>
    /// <param name="urlOrPath">Absolute URL, protocol-relative URL, or path.</param>
    /// <returns>
    /// The absolute URL as a string, or <c>null</c> when
    /// <paramref name="urlOrPath"/> is null or blank.
    /// </returns>
    /// <exception cref="UriFormatException">
    /// <paramref name="baseUrl"/> is needed but is not a valid absolute URI.
    /// </exception>
    public static string MakeAbsoluteUrl(string baseUrl, string urlOrPath)
    {
        if (string.IsNullOrWhiteSpace(urlOrPath))
        {
            return null;
        }

        var startsWithSlash = urlOrPath.StartsWith("/", StringComparison.Ordinal);

        // A value starting with "/" can be parsed as an implicit file path
        // ("/a/b" on Unix, "//host/a" as UNC), producing a file:// URI.
        // Such values are paths/network references, not absolute URLs, so they
        // must be resolved against the base instead. Explicit "file:" URLs are
        // unaffected because they do not start with "/".
        if (Uri.TryCreate(urlOrPath, UriKind.Absolute, out var absoluteUri)
            && !(startsWithSlash && absoluteUri.IsFile))
        {
            return absoluteUri.ToString();
        }

        // Guarantee exactly one trailing slash so the last base segment is kept
        // when the relative part is appended.
        var baseUri = new Uri(baseUrl.TrimEnd('/') + "/");

        // Protocol-relative reference ("//host/path"): inherit the base scheme.
        if (urlOrPath.StartsWith("//", StringComparison.Ordinal)
            && Uri.TryCreate(baseUri.Scheme + ":" + urlOrPath, UriKind.Absolute, out var networkUri))
        {
            return networkUri.ToString();
        }

        // Leading slashes are dropped so the path is appended beneath the base
        // URL's own path rather than replacing it.
        return new Uri(baseUri, urlOrPath.TrimStart('/')).ToString();
    }

    /// <summary>
    /// Returns the normalized inner text of <paramref name="node"/> with the text
    /// of its direct child <c>&lt;p&gt;</c> elements left out.
    /// </summary>
    /// <param name="node">Node to read; may be <c>null</c>.</param>
    /// <returns>
    /// The remaining text passed through <see cref="NormalizeWhitespace"/>, or
    /// <c>null</c> when <paramref name="node"/> is <c>null</c>.
    /// </returns>
    public static string ExtractNodeTextExcludingChildParagraph(HtmlNode node)
    {
        if (node == null)
        {
            return null;
        }

        // Work on a deep clone so the paragraphs are not removed from the caller's node.
        var clone = node.CloneNode(true);

        // "./p" selects only direct children; SelectNodes yields null when nothing matches.
        var paragraphs = clone.SelectNodes("./p");
        if (paragraphs != null)
        {
            foreach (var paragraph in paragraphs)
            {
                paragraph.Remove();
            }
        }

        return NormalizeWhitespace(clone.InnerText);
    }
}