To make it easy for a client to add in related links to pages like a Blog Post or Article, I like implementing some form of automation so there is one less thing to content manage. For a Kentico Cloud project, I took this very approach. I created a UrlHelper class that will carry out the following:
- Take in an absolute URL.
- Read the markup of the page.
- Select the title tag using Regex.
- Remove the site name from the title text, keeping only the part before the first "|" or ":" separator.
using Microsoft.Extensions.Caching.Memory;
using MyProject.Models.Site;
using System;
using System.IO;
using System.Linq;
using System.Net;
using System.Text.RegularExpressions;
namespace MyProject.Helpers
{
/// <summary>
/// Resolves a display title for an absolute URL by downloading the page and
/// reading its <c>&lt;title&gt;</c> element. Results are cached per URL.
/// </summary>
public class UrlHelper
{
    // Matches the first <title> element and captures its inner text (leading whitespace skipped).
    private static readonly Regex TitleRegex = new Regex(
        @"\<title\b[^>]*\>\s*(?<Title>[\s\S]*?)\</title\>",
        RegexOptions.IgnoreCase);

    // How long a resolved link stays in the cache before the page is fetched again.
    private static readonly TimeSpan CacheDuration = TimeSpan.FromHours(12);

    // Instance field: the cache is supplied per instance, so one instance must not
    // overwrite the cache used by another.
    private readonly IMemoryCache _cache;

    /// <summary>
    /// Creates a helper that stores resolved links in the supplied cache.
    /// </summary>
    /// <param name="memCache">Cache used to store resolved links, keyed by URL.</param>
    /// <exception cref="ArgumentNullException"><paramref name="memCache"/> is null.</exception>
    public UrlHelper(IMemoryCache memCache)
    {
        _cache = memCache ?? throw new ArgumentNullException(nameof(memCache));
    }

    /// <summary>
    /// Returns the title and URL of a link, read directly from the page at that URL.
    /// </summary>
    /// <param name="url">Absolute URL of the page to read.</param>
    /// <returns>
    /// A <see cref="PageLink"/> holding the page title (with any text after the first
    /// "|" or ":" separator removed) and the URL; or <c>null</c> when
    /// <paramref name="url"/> is null or empty, or no non-empty title could be extracted.
    /// </returns>
    /// <exception cref="WebException">The page could not be downloaded.</exception>
    public PageLink GetPageTitleFromUrl(string url)
    {
        if (string.IsNullOrEmpty(url))
        {
            return null;
        }

        if (_cache.TryGetValue(url, out PageLink cached))
        {
            return cached;
        }

        // Download failures propagate to the caller with their original stack trace.
        string title = ExtractTitle(DownloadHtml(url));
        if (string.IsNullOrEmpty(title))
        {
            // Nothing is cached here, so the next call retries the download.
            return null;
        }

        PageLink pageLink = new PageLink
        {
            PageName = title,
            PageUrl = url
        };

        _cache.Set(url, pageLink, DateTimeOffset.UtcNow.Add(CacheDuration));
        return pageLink;
    }

    /// <summary>
    /// Downloads the markup at <paramref name="url"/> and returns it as a UTF-8 decoded string.
    /// </summary>
    private static string DownloadHtml(string url)
    {
        // The using blocks release the client, stream and reader even when reading throws.
        using (WebClient client = new WebClient())
        using (Stream stream = client.OpenRead(url))
        using (StreamReader reader = new StreamReader(stream, System.Text.Encoding.UTF8))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Extracts the page title from <paramref name="pageHtml"/>, keeping only the text
    /// before the first "|" (or, when there is no "|", the first ":"). Returns null or
    /// an empty string when no usable title is present.
    /// </summary>
    private static string ExtractTitle(string pageHtml)
    {
        if (string.IsNullOrEmpty(pageHtml))
        {
            return null;
        }

        // Groups["Title"].Value is an empty string when the pattern does not match.
        string title = TitleRegex.Match(pageHtml).Groups["Title"].Value;
        if (string.IsNullOrEmpty(title))
        {
            return title;
        }

        // "|" takes precedence over ":" so "Topic: Detail | Site" keeps "Topic: Detail".
        int separatorIndex = title.IndexOf('|');
        if (separatorIndex < 0)
        {
            separatorIndex = title.IndexOf(':');
        }

        if (separatorIndex >= 0)
        {
            title = title.Substring(0, separatorIndex);
        }

        // Titles come from raw markup: decode entities such as &amp; and drop the
        // whitespace that is left next to the separator.
        return WebUtility.HtmlDecode(title).Trim();
    }
}
}
The method returns a PageLink object:
namespace MyProject.Models.Site
{
/// <summary>
/// Simple model pairing a page's display name with its URL.
/// </summary>
public class PageLink
{
/// <summary>Display name of the linked page.</summary>
public string PageName { get; set; }
/// <summary>URL of the linked page.</summary>
public string PageUrl { get; set; }
}
}
From an efficiency standpoint, I cache the result for 12 hours, as reading the markup of a page can be quite expensive if there is a lot of HTML.