feat: convert mime_type field to its own model. (#129)

This commit normalizes the Media/MIME type in RawContent to be a foreign key to a new table rather than a text string. This was done for a couple of reasons: 1. Recent problems with edx-platform's courseware_studentmodule table have driven home the point that we should try to reduce index sizes where possible, and RawContent is likely to grow into a very large table in time. 2. Media type designations can sometimes change, and that's much easier to do as a data migration if it's in its own lookup table, rather than affecting millions of rows in RawContent. In order to support these changes, I've also added LRU caching for MediaType lookup... which in turn required a test helper that will clear out the values between test calls. That's the rationale behind our new openedx_learning.lib.test_utils.TestCase.
openedx · Dec 5, 2023 · b629e49 · b629e49
1 parent ad71bfa
commit b629e49
Show file tree

Hide file tree

Showing 16 changed files with 255 additions and 34 deletions.
diff --git a/openedx_learning/__init__.py b/openedx_learning/__init__.py
@@ -1,4 +1,4 @@
 """
 Open edX Learning ("Learning Core").
 """
-__version__ = "0.3.7"
+__version__ = "0.4.0"
diff --git a/openedx_learning/core/components/admin.py b/openedx_learning/core/components/admin.py
@@ -158,7 +158,7 @@ def content_preview(cvc_obj: ComponentVersionRawContent) -> SafeText:
     """
     raw_content_obj = cvc_obj.raw_content
 
-    if raw_content_obj.mime_type.startswith("image/"):
+    if raw_content_obj.media_type.type == "image":
         return format_html(
             '<img src="{}" style="max-width: 100%;" />',
             # TODO: configure with settings value:

diff --git a/openedx_learning/core/components/api.py b/openedx_learning/core/components/api.py
@@ -117,6 +117,7 @@ def get_component_version_content(
     """
     return ComponentVersionRawContent.objects.select_related(
         "raw_content",
+        "raw_content__media_type",
         "component_version",
         "component_version__component",
         "component_version__component__learning_package",

diff --git a/openedx_learning/core/components/migrations/0001_initial.py b/openedx_learning/core/components/migrations/0001_initial.py
@@ -1,4 +1,4 @@
-# Generated by Django 3.2.19 on 2023-06-15 14:43
+# Generated by Django 3.2.23 on 2023-12-04 00:41
 
 import uuid
 
@@ -13,7 +13,7 @@ class Migration(migrations.Migration):
     initial = True
 
     dependencies = [
-        ('oel_publishing', '0001_initial'),
+        ('oel_publishing', '0002_alter_fk_on_delete'),
         ('oel_contents', '0001_initial'),
     ]
 

diff --git a/openedx_learning/core/contents/admin.py b/openedx_learning/core/contents/admin.py
@@ -18,14 +18,14 @@ class RawContentAdmin(ReadOnlyModelAdmin):
         "hash_digest",
         "file_link",
         "learning_package",
-        "mime_type",
+        "media_type",
         "size",
         "created",
     ]
     fields = [
         "learning_package",
         "hash_digest",
-        "mime_type",
+        "media_type",
         "size",
         "created",
         "file_link",
@@ -34,13 +34,13 @@ class RawContentAdmin(ReadOnlyModelAdmin):
     readonly_fields = [
         "learning_package",
         "hash_digest",
-        "mime_type",
+        "media_type",
         "size",
         "created",
         "file_link",
         "text_preview",
     ]
-    list_filter = ("mime_type", "learning_package")
+    list_filter = ("media_type", "learning_package")
     search_fields = ("hash_digest",)
 
     def file_link(self, raw_content):

diff --git a/openedx_learning/core/contents/api.py b/openedx_learning/core/contents/api.py
@@ -12,9 +12,10 @@
 from django.core.files.base import ContentFile
 from django.db.transaction import atomic
 
+from openedx_learning.lib.cache import lru_cache
 from openedx_learning.lib.fields import create_hash_digest
 
-from .models import RawContent, TextContent
+from .models import MediaType, RawContent, TextContent
 
 
 def create_raw_content(
@@ -28,9 +29,10 @@ def create_raw_content(
     Create a new RawContent instance and persist it to storage.
     """
     hash_digest = hash_digest or create_hash_digest(data_bytes)
+
     raw_content = RawContent.objects.create(
         learning_package_id=learning_package_id,
-        mime_type=mime_type,
+        media_type_id=get_media_type_id(mime_type),
         hash_digest=hash_digest,
         size=len(data_bytes),
         created=created,
@@ -54,6 +56,39 @@ def create_text_from_raw_content(raw_content: RawContent, encoding="utf-8-sig")
     )
 
 
+@lru_cache(maxsize=128)
+def get_media_type_id(mime_type: str) -> int:
+    """
+    Return the MediaType.id for the desired mime_type string.
+
+    If it is not found in the database, a new entry will be created for it. This
+    lazy-writing means that MediaType entry IDs will *not* be the same across
+    different server instances, and apps should not assume that will be the
+    case. Even if we were to preload a bunch of common ones, we can't anticipate
+    the different XBlocks that will be installed in different server instances,
+    each of which will use their own MediaType.
+
+    This will typically only be called when create_raw_content is calling it to
+    lookup the media_type_id it should use for a new RawContent. If you already
+    have a RawContent instance, it makes much more sense to access its
+    media_type relation.
+    """
+    if "+" in mime_type:
+        base, suffix = mime_type.split("+")
+    else:
+        base = mime_type
+        suffix = ""
+
+    main_type, sub_type = base.split("/")
+    mt, _created = MediaType.objects.get_or_create(
+        type=main_type,
+        sub_type=sub_type,
+        suffix=suffix,
+    )
+
+    return mt.id
+
+
 def get_or_create_raw_content(
     learning_package_id: int,
     data_bytes: bytes,

diff --git a/openedx_learning/core/contents/migrations/0001_initial.py b/openedx_learning/core/contents/migrations/0001_initial.py
@@ -1,4 +1,4 @@
-# Generated by Django 3.2.19 on 2023-06-15 14:43
+# Generated by Django 3.2.23 on 2023-12-04 00:41
 
 import django.core.validators
 import django.db.models.deletion
@@ -13,20 +13,29 @@ class Migration(migrations.Migration):
     initial = True
 
     dependencies = [
-        ('oel_publishing', '0001_initial'),
+        ('oel_publishing', '0002_alter_fk_on_delete'),
     ]
 
     operations = [
+        migrations.CreateModel(
+            name='MediaType',
+            fields=[
+                ('id', models.AutoField(primary_key=True, serialize=False)),
+                ('type', openedx_learning.lib.fields.MultiCollationCharField(db_collations={'mysql': 'utf8mb4_unicode_ci', 'sqlite': 'NOCASE'}, max_length=127)),
+                ('sub_type', openedx_learning.lib.fields.MultiCollationCharField(db_collations={'mysql': 'utf8mb4_unicode_ci', 'sqlite': 'NOCASE'}, max_length=127)),
+                ('suffix', openedx_learning.lib.fields.MultiCollationCharField(blank=True, db_collations={'mysql': 'utf8mb4_unicode_ci', 'sqlite': 'NOCASE'}, max_length=127)),
+            ],
+        ),
         migrations.CreateModel(
             name='RawContent',
             fields=[
                 ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                 ('hash_digest', models.CharField(editable=False, max_length=40)),
-                ('mime_type', openedx_learning.lib.fields.MultiCollationCharField(db_collations={'mysql': 'utf8mb4_unicode_ci', 'sqlite': 'NOCASE'}, max_length=255)),
                 ('size', models.PositiveBigIntegerField(validators=[django.core.validators.MaxValueValidator(50000000)])),
                 ('created', models.DateTimeField(validators=[openedx_learning.lib.validators.validate_utc_datetime])),
                 ('file', models.FileField(null=True, upload_to='')),
                 ('learning_package', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='oel_publishing.learningpackage')),
+                ('media_type', models.ForeignKey(on_delete=django.db.models.deletion.PROTECT, to='oel_contents.mediatype')),
             ],
             options={
                 'verbose_name': 'Raw Content',
@@ -41,10 +50,13 @@ class Migration(migrations.Migration):
                 ('length', models.PositiveIntegerField()),
             ],
         ),
-        # Call out to custom code here to change row format for TextContent
+        migrations.AddConstraint(
+            model_name='mediatype',
+            constraint=models.UniqueConstraint(fields=('type', 'sub_type', 'suffix'), name='oel_contents_uniq_t_st_sfx'),
+        ),
         migrations.AddIndex(
             model_name='rawcontent',
-            index=models.Index(fields=['learning_package', 'mime_type'], name='oel_content_idx_lp_mime_type'),
+            index=models.Index(fields=['learning_package', 'media_type'], name='oel_content_idx_lp_media_type'),
         ),
         migrations.AddIndex(
             model_name='rawcontent',
@@ -56,6 +68,6 @@ class Migration(migrations.Migration):
         ),
         migrations.AddConstraint(
             model_name='rawcontent',
-            constraint=models.UniqueConstraint(fields=('learning_package', 'mime_type', 'hash_digest'), name='oel_content_uniq_lc_mime_type_hash_digest'),
+            constraint=models.UniqueConstraint(fields=('learning_package', 'media_type', 'hash_digest'), name='oel_content_uniq_lc_media_type_hash_digest'),
         ),
     ]
diff --git a/openedx_learning/core/contents/models.py b/openedx_learning/core/contents/models.py
@@ -3,6 +3,8 @@
 the simplest building blocks to store data with. They need to be composed into
 more intelligent data models to be useful.
 """
+from functools import cached_property
+
 from django.conf import settings
 from django.core.files.storage import default_storage
 from django.core.validators import MaxValueValidator
@@ -18,6 +20,81 @@
 from ..publishing.models import LearningPackage
 
 
+class MediaType(models.Model):
+    """
+    Stores Media types for use by RawContent models.
+
+    This is the same as MIME types (the IANA renamed MIME Types to Media Types).
+    We don't pre-populate this table, so APIs that add RawContent must ensure
+    that the desired Media Type exists.
+
+    Media types are written as {type}/{sub_type}+{suffix}, where suffixes are
+    seldom used.
+
+    * application/json
+    * text/css
+    * image/svg+xml
+    * application/vnd.openedx.xblock.v1.problem+xml
+
+    We have this as a separate model (instead of a field on RawContent) because:
+
+    1. We can save a lot on storage and indexing for RawContent if we're just
+       storing foreign key references there, rather than the entire content
+       string to be indexed. This is especially relevant for our (long) custom
+       types like "application/vnd.openedx.xblock.v1.problem+xml".
+    2. These values can occasionally change. For instance, "text/javascript" vs.
+       "application/javascript". Also, we will be using a fair number of "vnd."
+       style of custom content types, and we may want the flexibility of
+       changing that without having to worry about migrating millions of rows of
+       RawContent.
+    """
+    # We're going to have many foreign key references from RawContent into this
+    # model, and we don't need to store those as 8-byte BigAutoField, as is the
+    # default for this app. It's likely that a SmallAutoField would work, but I
+    # can just barely imagine using more than 32K Media types if we have a bunch
+    # of custom "vnd." entries, or start tracking suffixes and parameters. Which
+    # is how we end up at the 4-byte AutoField.
+    id = models.AutoField(primary_key=True)
+
+    # Media types are denoted as {type}/{sub_type}+{suffix}. We currently do not
+    # support parameters.
+
+    # Media type, e.g. "application", "text", "image". Per RFC 4288, this can be
+    # at most 127 chars long and is case insensitive. In practice, it's almost
+    # always written in lowercase.
+    type = case_insensitive_char_field(max_length=127, blank=False, null=False)
+
+    # Media sub-type, e.g. "json", "css", "png". Per RFC 4288, this can be at
+    # most 127 chars long and is case insensitive. In practice, it's almost
+    # always written in lowercase.
+    sub_type = case_insensitive_char_field(max_length=127, blank=False, null=False)
+
+    # Suffix, usually just "xml" (e.g. "image/svg+xml"). Usually blank. I
+    # couldn't find an RFC description of the length limit, and 127 is probably
+    # excessive. But this table should be small enough where it doesn't really
+    # matter.
+    suffix = case_insensitive_char_field(max_length=127, blank=True, null=False)
+
+    class Meta:
+        constraints = [
+            # Make sure all (type + sub_type + suffix) combinations are unique.
+            models.UniqueConstraint(
+                fields=[
+                    "type",
+                    "sub_type",
+                    "suffix",
+                ],
+                name="oel_contents_uniq_t_st_sfx",
+            ),
+        ]
+
+    def __str__(self):
+        base = f"{self.type}/{self.sub_type}"
+        if self.suffix:
+            return f"{base}+{self.suffix}"
+        return base
+
+
 class RawContent(models.Model):  # type: ignore[django-manager-missing]
     """
     This is the most basic piece of raw content data, with no version metadata.
@@ -77,15 +154,8 @@ class RawContent(models.Model):  # type: ignore[django-manager-missing]
     # openedx.lib.fields module.
     hash_digest = hash_field()
 
-    # MIME type, such as "text/html", "image/png", etc. Per RFC 4288, MIME type
-    # and sub-type may each be 127 chars, making a max of 255 (including the "/"
-    # in between). Also, while MIME types are almost always written in lowercase
-    # as a matter of convention, by spec they are NOT case sensitive.
-    #
-    # DO NOT STORE parameters here, e.g. "charset=". We can make a new field if
-    # that becomes necessary. If we do decide to store parameters and values
-    # later, note that those *may be* case sensitive.
-    mime_type = case_insensitive_char_field(max_length=255, blank=False)
+    # What is the Media type (a.k.a. MIME type) of this data?
+    media_type = models.ForeignKey(MediaType, on_delete=models.PROTECT)
 
     # This is the size of the raw data file in bytes. This can be different than
     # the character length, since UTF-8 encoding can use anywhere between 1-4
@@ -107,28 +177,32 @@ class RawContent(models.Model):  # type: ignore[django-manager-missing]
         storage=settings.OPENEDX_LEARNING.get("STORAGE", default_storage),  # type: ignore
     )
 
+    @cached_property
+    def mime_type(self):
+        return str(self.media_type)
+
     class Meta:
         constraints = [
             # Make sure we don't store duplicates of this raw data within the
             # same LearningPackage, unless they're of different MIME types.
             models.UniqueConstraint(
                 fields=[
                     "learning_package",
-                    "mime_type",
+                    "media_type",
                     "hash_digest",
                 ],
-                name="oel_content_uniq_lc_mime_type_hash_digest",
+                name="oel_content_uniq_lc_media_type_hash_digest",
             ),
         ]
         indexes = [
-            # LearningPackage MIME type Index:
+            # LearningPackage Media type Index:
             #   * Break down Content counts by type/subtype with in a
             #     LearningPackage.
             #   * Find all the Content in a LearningPackage that matches a
             #     certain MIME type (e.g. "image/png", "application/pdf".
             models.Index(
-                fields=["learning_package", "mime_type"],
-                name="oel_content_idx_lp_mime_type",
+                fields=["learning_package", "media_type"],
+                name="oel_content_idx_lp_media_type",
             ),
             # LearningPackage (reverse) Size Index:
             #   * Find largest Content in a LearningPackage.

diff --git a/openedx_learning/lib/cache.py b/openedx_learning/lib/cache.py
@@ -0,0 +1,34 @@
+"""
+Test-friendly helpers for caching.
+
+LRU caching can be especially helpful for Learning Core data models because so
+much of it is immutable, e.g. Media Type lookups. But while these data models
+are immutable within a given site, we also want to be able to track and clear
+these caches across test runs. Later on, we may also want to inspect them to
+make sure they are not growing overly large.
+"""
+import functools
+
+# List of functions that have our
+_lru_cached_fns = []
+
+
+def lru_cache(*args, **kwargs):
+    """
+    Thin wrapper over functools.lru_cache that lets us clear all caches later.
+    """
+    def decorator(fn):
+        wrapped_fn = functools.lru_cache(*args, **kwargs)(fn)
+        _lru_cached_fns.append(wrapped_fn)
+        return wrapped_fn
+    return decorator
+
+
+def clear_lru_caches():
+    """
+    Clear all LRU caches that use our lru_cache decorator.
+
+    Useful for tests.
+    """
+    for fn in _lru_cached_fns:
+        fn.cache_clear()
diff --git a/openedx_learning/lib/fields.py b/openedx_learning/lib/fields.py
@@ -121,6 +121,16 @@ def hash_field() -> models.CharField:
 
     Use the create_hash_digest function to generate data suitable for this
     field.
+
+    There are a couple of ways that we could have stored this more efficiently,
+    but we don't at this time:
+
+    1. A BinaryField would be the most space efficient, but Django doesn't
+       support indexing a BinaryField in a MySQL database.
+    2. We could make the field case-sensitive and run it through a URL-safe
+       base64 encoding. But the amount of space this saves vs. the complexity
+       didn't seem worthwhile, particularly the possibility of case-sensitivity
+       related bugs.
     """
     return models.CharField(
         max_length=40,