Skip to content

Commit

Permalink
cont
Browse files Browse the repository at this point in the history
  • Loading branch information
BobLd committed Nov 27, 2024
1 parent c734c5a commit 98220ef
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 40 deletions.
75 changes: 38 additions & 37 deletions src/UglyToad.PdfPig.Fonts/GlyphList.cs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
public class GlyphList
{
/// <summary>
/// <c>.notdef</c>.
/// <c>.notdef</c> name.
/// </summary>
public const string NotDefined = ".notdef";

Expand All @@ -37,17 +37,7 @@ public class GlyphList
public static GlyphList AdditionalGlyphList => LazyAdditionalGlyphList.Value;

private static readonly Lazy<GlyphList> LazyZapfDingbatsGlyphList = new Lazy<GlyphList>(() => GlyphListFactory.Get("zapfdingbats"));

private static readonly HashSet<string> OddLigaturesNames =
[
// See https://en.wikipedia.org/wiki/Ligature_(writing)
"f_f", "f_i", "f_j", "f_l", "f_a", "f_e", "f_o", "f_r", "f_s", "f_t", "f_b", "f_h",
"f_u", "f_y", "f_.", "f_,", "f_-",
"f_f_i", "f_f_l",
// Sometimes, ligatures for ⟨st⟩ (st), ⟨ſt⟩ (ſt), ⟨ch⟩, ⟨ck⟩, ⟨ct⟩, ⟨Qu⟩ and ⟨Th⟩ are used (e.g. in the typeface Linux Libertine).
"s_t", "ſ_t", "c_h", "c_k", "c_t", "Q_u", "T_h"
]; // TODO - Go use for FrozenSet


/// <summary>
/// Zapf Dingbats.
/// </summary>
Expand Down Expand Up @@ -94,6 +84,7 @@ public string UnicodeCodePointToName(int unicodeValue)

/// <summary>
/// Get the unicode value for the glyph name.
/// See <see href="https://github.com/adobe-type-tools/agl-specification"/>.
/// </summary>
public string NameToUnicode(string name)
{
Expand All @@ -113,25 +104,47 @@ public string NameToUnicode(string name)
}

string unicode;
// Remove suffixes
// 1. Drop all the characters from the glyph name starting with the first occurrence of a period (U+002E FULL STOP), if any.
if (name.IndexOf('.') > 0)
{
unicode = NameToUnicode(name.Substring(0, name.IndexOf('.')));
}
else if (name.StartsWith("uni") && name.Length == 7)
// 2. Split the remaining string into a sequence of components, using underscore (U+005F LOW LINE) as the delimiter.
else if (name.IndexOf('_') > 0)
{
/*
* MOZILLA-3136-0.pdf
* 68-1990-01_A.pdf
* TIKA-2054-0.pdf
*/
var sb = new StringBuilder();
foreach (var s in name.Split('_'))
{
sb.Append(NameToUnicode(s));
}

unicode = sb.ToString();
}
// Otherwise, if the component is of the form ‘uni’ (U+0075, U+006E, and U+0069) followed by a sequence of uppercase hexadecimal
// digits (0–9 and A–F, meaning U+0030 through U+0039 and U+0041 through U+0046), if the length of that sequence is a multiple
// of four, and if each group of four digits represents a value in the ranges 0000 through D7FF or E000 through FFFF, then
// interpret each as a Unicode scalar value and map the component to the string made of those scalar values. Note that the range
// and digit-length restrictions mean that the ‘uni’ glyph name prefix can be used only with UVs in the Basic Multilingual Plane (BMP).
else if (name.StartsWith("uni") && (name.Length - 3) % 4 == 0)
{
// test for Unicode name in the format uniXXXX where X is hex
int nameLength = name.Length;

var uniStr = new StringBuilder();

var foundUnicode = true;
for (int chPos = 3; chPos + 4 <= nameLength; chPos += 4)
{
if (!int.TryParse(name.AsSpanOrSubstring(chPos, 4), NumberStyles.HexNumber, CultureInfo.InvariantCulture, out var codePoint))
if (!int.TryParse(name.AsSpanOrSubstring(chPos, 4),
NumberStyles.HexNumber,
CultureInfo.InvariantCulture,
out var codePoint))
{
foundUnicode = false;
break;
return null;
}

if (codePoint > 0xD7FF && codePoint < 0xE000)
Expand All @@ -142,42 +155,30 @@ public string NameToUnicode(string name)
uniStr.Append((char)codePoint);
}

if (!foundUnicode)
{
return null;
}

unicode = uniStr.ToString();
}
else if (name.StartsWith("u", StringComparison.Ordinal) && name.Length == 5)
// Otherwise, if the component is of the form ‘u’ (U+0075) followed by a sequence of four to six uppercase hexadecimal digits (0–9
// and A–F, meaning U+0030 through U+0039 and U+0041 through U+0046), and those digits represent a value in the ranges 0000 through
// D7FF or E000 through 10FFFF, then interpret it as a Unicode scalar value and map the component to the string made of this scalar value.
else if (name.StartsWith("u", StringComparison.Ordinal) && name.Length >= 5 && name.Length <= 7)
{
// test for an alternate Unicode name representation: 'u' followed by four to six hex digits (uXXXX, uXXXXX or uXXXXXX)
var codePoint = int.Parse(name.AsSpanOrSubstring(1), NumberStyles.HexNumber, CultureInfo.InvariantCulture);

if (codePoint > 0xD7FF && codePoint < 0xE000)
{
throw new InvalidFontFormatException(
$"Unicode character name with disallowed code area: {name}");
throw new InvalidFontFormatException($"Unicode character name with disallowed code area: {name}");
}

unicode = char.ConvertFromUtf32(codePoint);
}
// Ad-hoc special cases
else if (name.StartsWith("c", StringComparison.OrdinalIgnoreCase) && name.Length >= 3 && name.Length <= 4)
{
// name representation cXXX
var codePoint = int.Parse(name.AsSpanOrSubstring(1), NumberStyles.Integer, CultureInfo.InvariantCulture);
System.Diagnostics.Debug.Assert(codePoint > 0);
unicode = char.ConvertFromUtf32(codePoint);
}
else if (name.IndexOf('_') > 0 && OddLigaturesNames.Contains(name))
{
/*
* MOZILLA-3136-0.pdf
* 68-1990-01_A.pdf
* TIKA-2054-0.pdf
*/
unicode = name.Replace("_", "");
}
// Otherwise, map the component to an empty string.
else
{
return null;
Expand Down
21 changes: 18 additions & 3 deletions src/UglyToad.PdfPig.Tests/Fonts/Encodings/GlyphListTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ public void NameToUnicodeRemovesSuffix()
{
var list = new GlyphList(new Dictionary<string, string>
{
{"Boris", "B"}
{ "Boris", "B" }
});

var result = list.NameToUnicode("Boris.Special");
Expand All @@ -70,7 +70,7 @@ public void NameToUnicodeConvertsHexAndUsesHexValue()
{
var list = new GlyphList(new Dictionary<string, string>
{
{"B", "X"}
{ "B", "X" }
});

var result = list.NameToUnicode("uni0042");
Expand All @@ -83,12 +83,27 @@ public void NameToUnicodeConvertsShortHexAndUsesHexValue()
{
var list = new GlyphList(new Dictionary<string, string>
{
{"E", "Æ"}
{ "E", "Æ" }
});

var result = list.NameToUnicode("u0045");

Assert.Equal("E", result);
}


[Fact(Skip = "TODO - String don't match")]
public void NameToUnicodeConvertAglSpecification()
{
    // Example from the Adobe Glyph List specification:
    // https://github.com/adobe-type-tools/agl-specification?tab=readme-ov-file#3-examples
    // Expected mapping per that example: the ".alternate" suffix is dropped and each
    // '_'-separated component is converted on its own:
    //   Lcommaaccent -> U+013B
    //   uni20AC0308  -> U+20AC U+0308
    //   u1040C       -> U+1040C
    var list = new GlyphList(new Dictionary<string, string>
    {
        { "Lcommaaccent", "\u013B" }
    });

    var result = list.NameToUnicode("Lcommaaccent_uni20AC0308_u1040C.alternate");

    // U+1040C lies outside the Basic Multilingual Plane, so it must be written with the
    // 8-digit \U escape. "\u1040C" would be U+1040 followed by the letter 'C'.
    Assert.Equal("\u013B\u20AC\u0308\U0001040C", result);
}
}
}

0 comments on commit 98220ef

Please sign in to comment.