Use only alphanumeric characters in identifiers (remove accents)

Correct regex
chdemko · Nov 12, 2015 · c42b0b1 · c42b0b1
1 parent ac9a096
commit c42b0b1
Show file tree

Hide file tree

Showing 3 changed files with 101 additions and 70 deletions.
diff --git a/pandoc-numbering-sample.md b/pandoc-numbering-sample.md
@@ -7,23 +7,23 @@ This is the first section
 
 Exercise #
 
-This is the first exercise. Have also a look at the [](#second).
+This is the first exercise. Have also a look at the [](#exercise:second).
 
-> Theorem (Needed for the [second exercise](#second)) #theorem1
+> Theorem (Needed for the [second exercise](#exercise:second)) #theorem:first
 > 
 > This is a the first theorem.
-> Look at the [exercise](#second "Go to the exercise #").
+> Look at the [exercise](#exercise:second "Go to the exercise #").
 
-Exercise (This is the second exercise) #second
+Exercise (This is the second exercise) #exercise:second
 
-Use [_theorem #_](#theorem1)
+Use [_theorem #_](#theorem:first)
 
 This is the second section
 ==========================
 
 > Theorem #
 > 
-> Another theorem.
+> Another theorem. Can be useful in [](#exercise:1)
 
 Unnumbered ##
 
diff --git a/pandoc_numbering.py b/pandoc_numbering.py
@@ -4,17 +4,17 @@
 Pandoc filter to number all kinds of things.
 """
 
-from pandocfilters import walk, stringify, Str, Space, Para, Strong, Span, Link, Emph
+from pandocfilters import walk, stringify, Str, Space, Para, Strong, Span, Link, Emph, RawInline
 from functools import reduce
 import sys
 import json
 import io
 import codecs
+import re
+import unicodedata
 
 count = {}
-numbers = {}
-labels = {}
-replace = None
+information = {}
 
 def toJSONFilters(actions):
     """Converts a list of actions into a filter
@@ -34,79 +34,110 @@ def toJSONFilters(actions):
         format = ""
 
     altered = reduce(lambda x, action: walk(x, action, format, doc[0]['unMeta']), actions, doc)
-
     json.dump(altered, sys.stdout)
 
+def removeAccents(string):
+    nfkd_form = unicodedata.normalize('NFKD', string)
+    return u"".join([c for c in nfkd_form if not unicodedata.combining(c)])
+
+def toIdentifier(string):
+   # replace invalid characters by dash
+   string = re.sub('[^0-9a-zA-Z_-]+', '-', removeAccents(string.lower()))
+
+   # Remove leading non-letter characters (digits, dashes, underscores)
+   string = re.sub('^[^a-zA-Z]+', '', string)
+
+   return string
+
 def numbering(key, value, format, meta):
     if key == 'Para':
         length = len(value)
         if length >= 3 and value[length - 2] == Space() and value[length - 1]['t'] == 'Str':
             last = value[length - 1]['c']
-            if last[0] == '#':
-                tag = last
 
+            if re.match('^#([a-zA-Z][\w:.-]*)?$', last):
                 # Is it a Para and the last element is an identifier beginning with '#'
-                if len(last) == 1 or last[1] != '#':
-                    global count, numbers, labels
-
-                    # Detect the title
-                    title = None
-                    if value[length - 3]['t'] == 'Str' and value[length - 3]['c'][-1:] == ')':
-                        for (i, item) in enumerate(value):
-                            if item['t'] == 'Str' and item['c'][0] == '(':
-                                title = Emph(value[i - 1:length - 2])
-                                value = value[:i - 1] + value[length - 2:]
-                                length = i + 1
-                                break
-
-                    # Convert the value to a category (eliminating the '#')
-                    category = stringify(value[:length - 2])
-                    if category not in count:
-                        count[category] = 0
-                    count[category] = count[category] + 1
-
-                    # Replace the '#' by the category count
-                    value[length - 1]['c'] = str(count[category])
-
-                    # Prepare the final text
-                    text = [Strong(value)]
-
-                    # Add the title to the final text
-                    if title != None:
-                        text.append(title)
-
-                    if tag != '#':
-                        # Store the numbers and the label for automatic numbering (See referencing function)
-                        numbers[tag] = value[length - 1]['c']
-                        labels[tag] = value
-
-                        # Return the final text in a Span element embedded in a Para element
-                        return Para([Span([tag[1:], [], []], text)])
-                    else:
-                        # Return the final text in a Para element
-                        return Para(text)
+                global count, information
+
+                # Detect the title
+                title = None
+                if value[length - 3]['t'] == 'Str' and value[length - 3]['c'][-1:] == ')':
+                    for (i, item) in enumerate(value):
+                        if item['t'] == 'Str' and item['c'][0] == '(':
+                            title = Emph(value[i:length - 2])
+                            value = value[:i - 1] + value[length - 2:]
+                            length = i + 1
+                            break
+
+                # Convert the value to a name (eliminating the '#')
+                name = toIdentifier(stringify(value[:length - 2]))
+
+                # Is it a new category?
+                if name not in count:
+                    count[name] = 0
+
+                count[name] = count[name] + 1
+
+                # Get the number
+                number = str(count[name])
+
+                # Determine the tag
+                if last == '#':
+                    tag = name + ':' + number
                 else:
-                    # Special case where the last element is '##...'
-                    value[length - 1]['c'] = value[length - 1]['c'].replace('##', '#', 1)
-                    return Para(value)
+                    tag = last[1:]
+
+                # Replace the '#' by the name count
+                value[length - 1]['c'] = number
+
+                # Prepare the final text
+                text = [Strong(value)]
+
+                # Add the title to the final text
+                if title != None:
+                    text.append(Space())
+                    text.append(title)
+
+                # Store the numbers and the label for automatic numbering (See referencing function)
+                information[tag] = {'number': number, 'text': value}
+
+                # Prepare the contents
+                contents = [Span([tag, [], []], text)]
+
+                # Special case for LaTeX
+                if format == 'latex':
+                    contents.insert(0, RawInline('tex', '\\phantomsection'))
+
+                # Return the contents in a Para element
+                return Para(contents)
+            elif re.match('^##([a-zA-Z][\w:.-]*)?$', last):
+                # Special case where the last element is '##...'
+                value[length - 1]['c'] = value[length - 1]['c'].replace('##', '#', 1)
+                return Para(value)
+
+replace = None
 
 def referencing(key, value, format, meta):
-    global numbers, labels, replace
+    global information, replace
 
-    # Is it a link with a right tag?
+    # Is it a link with a right reference?
     if key == 'Link':
-        [text, [identifier, title]] = value
-        if identifier in numbers:
-            # Replace all '#' with the corresponding number in the title
-            value[1][1] = title.replace('#', numbers[identifier])
-
-            if text == []:
-                # The link text is empty, replace it with the default label
-                value[0] = labels[identifier]
-            else:
-                # The link text is not empty, replace all '#' with the corresponding number
-                replace = numbers[identifier]
-                value[0] = walk(text, replacing, format, meta)
+        [text, [reference, title]] = value
+        if re.match('^#([a-zA-Z][\w:.-]*)?$', reference):
+            # Compute the name
+            tag = reference[1:]
+
+            if tag in information:
+                # Replace all '#' with the corresponding number in the title
+                value[1][1] = title.replace('#', information[tag]['number'])
+
+                if text == []:
+                    # The link text is empty, replace it with the default label
+                    value[0] = information[tag]['text']
+                else:
+                    # The link text is not empty, replace all '#' with the corresponding number
+                    replace = information[tag]['number']
+                    value[0] = walk(text, replacing, format, meta)
 
 def replacing(key, value, format, meta):
     global replace

diff --git a/setup.py b/setup.py
@@ -28,7 +28,7 @@
     # Versions should comply with PEP440.  For a discussion on single-sourcing
     # the version across setup.py and the project code, see
     # https://packaging.python.org/en/latest/single_source_version.html
-    version='0.3.1',
+    version='0.3.2',
 
     # The project's description
     description='A pandoc filter for automatic numbering',