Skip to content

Commit

Permalink
feat: convert mime_type field to its own model. (#129)
Browse files Browse the repository at this point in the history
This commit normalizes the Media/MIME type in RawContent to be a foreign
key to a new table rather than a text string. This was done for a couple
of reasons:

1. Recent problems with edx-platform's courseware_studentmodule table
   have driven home the point that we should try to reduce index sizes
   where possible, and RawContent is likely to grow into a very large
   table in time.
2. Media type designations can sometimes change, and that's much easier
   to do as a data migration if it's in its own lookup table, rather
   than affecting millions of rows in RawContent.

In order to support these changes, I've also added LRU caching for
MediaType lookup... which in turn required a test helper that will clear
out the values between test calls. That's the rationale behind our new
openedx_learning.lib.test_utils.TestCase.
  • Loading branch information
ormsbee authored Dec 5, 2023
1 parent ad71bfa commit b629e49
Show file tree
Hide file tree
Showing 16 changed files with 255 additions and 34 deletions.
2 changes: 1 addition & 1 deletion openedx_learning/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""
Open edX Learning ("Learning Core").
"""
__version__ = "0.3.7"
__version__ = "0.4.0"
2 changes: 1 addition & 1 deletion openedx_learning/core/components/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ def content_preview(cvc_obj: ComponentVersionRawContent) -> SafeText:
"""
raw_content_obj = cvc_obj.raw_content

if raw_content_obj.mime_type.startswith("image/"):
if raw_content_obj.media_type.type == "image":
return format_html(
'<img src="{}" style="max-width: 100%;" />',
# TODO: configure with settings value:
Expand Down
1 change: 1 addition & 0 deletions openedx_learning/core/components/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ def get_component_version_content(
"""
return ComponentVersionRawContent.objects.select_related(
"raw_content",
"raw_content__media_type",
"component_version",
"component_version__component",
"component_version__component__learning_package",
Expand Down
4 changes: 2 additions & 2 deletions openedx_learning/core/components/migrations/0001_initial.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Generated by Django 3.2.19 on 2023-06-15 14:43
# Generated by Django 3.2.23 on 2023-12-04 00:41

import uuid

Expand All @@ -13,7 +13,7 @@ class Migration(migrations.Migration):
initial = True

dependencies = [
('oel_publishing', '0001_initial'),
('oel_publishing', '0002_alter_fk_on_delete'),
('oel_contents', '0001_initial'),
]

Expand Down
8 changes: 4 additions & 4 deletions openedx_learning/core/contents/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,14 @@ class RawContentAdmin(ReadOnlyModelAdmin):
"hash_digest",
"file_link",
"learning_package",
"mime_type",
"media_type",
"size",
"created",
]
fields = [
"learning_package",
"hash_digest",
"mime_type",
"media_type",
"size",
"created",
"file_link",
Expand All @@ -34,13 +34,13 @@ class RawContentAdmin(ReadOnlyModelAdmin):
readonly_fields = [
"learning_package",
"hash_digest",
"mime_type",
"media_type",
"size",
"created",
"file_link",
"text_preview",
]
list_filter = ("mime_type", "learning_package")
list_filter = ("media_type", "learning_package")
search_fields = ("hash_digest",)

def file_link(self, raw_content):
Expand Down
39 changes: 37 additions & 2 deletions openedx_learning/core/contents/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,10 @@
from django.core.files.base import ContentFile
from django.db.transaction import atomic

from openedx_learning.lib.cache import lru_cache
from openedx_learning.lib.fields import create_hash_digest

from .models import RawContent, TextContent
from .models import MediaType, RawContent, TextContent


def create_raw_content(
Expand All @@ -28,9 +29,10 @@ def create_raw_content(
Create a new RawContent instance and persist it to storage.
"""
hash_digest = hash_digest or create_hash_digest(data_bytes)

raw_content = RawContent.objects.create(
learning_package_id=learning_package_id,
mime_type=mime_type,
media_type_id=get_media_type_id(mime_type),
hash_digest=hash_digest,
size=len(data_bytes),
created=created,
Expand All @@ -54,6 +56,39 @@ def create_text_from_raw_content(raw_content: RawContent, encoding="utf-8-sig")
)


@lru_cache(maxsize=128)
def get_media_type_id(mime_type: str) -> int:
"""
Return the MediaType.id for the desired mime_type string.
If it is not found in the database, a new entry will be created for it. This
lazy-writing means that MediaType entry IDs will *not* be the same across
different server instances, and apps should not assume that will be the
case. Even if we were to preload a bunch of common ones, we can't anticipate
the different XBlocks that will be installed in different server instances,
each of which will use their own MediaType.
This will typically only be called when create_raw_content is calling it to
lookup the media_type_id it should use for a new RawContent. If you already
have a RawContent instance, it makes much more sense to access its
media_type relation.
"""
if "+" in mime_type:
base, suffix = mime_type.split("+")
else:
base = mime_type
suffix = ""

main_type, sub_type = base.split("/")
mt, _created = MediaType.objects.get_or_create(
type=main_type,
sub_type=sub_type,
suffix=suffix,
)

return mt.id


def get_or_create_raw_content(
learning_package_id: int,
data_bytes: bytes,
Expand Down
24 changes: 18 additions & 6 deletions openedx_learning/core/contents/migrations/0001_initial.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Generated by Django 3.2.19 on 2023-06-15 14:43
# Generated by Django 3.2.23 on 2023-12-04 00:41

import django.core.validators
import django.db.models.deletion
Expand All @@ -13,20 +13,29 @@ class Migration(migrations.Migration):
initial = True

dependencies = [
('oel_publishing', '0001_initial'),
('oel_publishing', '0002_alter_fk_on_delete'),
]

operations = [
migrations.CreateModel(
name='MediaType',
fields=[
('id', models.AutoField(primary_key=True, serialize=False)),
('type', openedx_learning.lib.fields.MultiCollationCharField(db_collations={'mysql': 'utf8mb4_unicode_ci', 'sqlite': 'NOCASE'}, max_length=127)),
('sub_type', openedx_learning.lib.fields.MultiCollationCharField(db_collations={'mysql': 'utf8mb4_unicode_ci', 'sqlite': 'NOCASE'}, max_length=127)),
('suffix', openedx_learning.lib.fields.MultiCollationCharField(blank=True, db_collations={'mysql': 'utf8mb4_unicode_ci', 'sqlite': 'NOCASE'}, max_length=127)),
],
),
migrations.CreateModel(
name='RawContent',
fields=[
('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('hash_digest', models.CharField(editable=False, max_length=40)),
('mime_type', openedx_learning.lib.fields.MultiCollationCharField(db_collations={'mysql': 'utf8mb4_unicode_ci', 'sqlite': 'NOCASE'}, max_length=255)),
('size', models.PositiveBigIntegerField(validators=[django.core.validators.MaxValueValidator(50000000)])),
('created', models.DateTimeField(validators=[openedx_learning.lib.validators.validate_utc_datetime])),
('file', models.FileField(null=True, upload_to='')),
('learning_package', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='oel_publishing.learningpackage')),
('media_type', models.ForeignKey(on_delete=django.db.models.deletion.PROTECT, to='oel_contents.mediatype')),
],
options={
'verbose_name': 'Raw Content',
Expand All @@ -41,10 +50,13 @@ class Migration(migrations.Migration):
('length', models.PositiveIntegerField()),
],
),
# Call out to custom code here to change row format for TextContent
migrations.AddConstraint(
model_name='mediatype',
constraint=models.UniqueConstraint(fields=('type', 'sub_type', 'suffix'), name='oel_contents_uniq_t_st_sfx'),
),
migrations.AddIndex(
model_name='rawcontent',
index=models.Index(fields=['learning_package', 'mime_type'], name='oel_content_idx_lp_mime_type'),
index=models.Index(fields=['learning_package', 'media_type'], name='oel_content_idx_lp_media_type'),
),
migrations.AddIndex(
model_name='rawcontent',
Expand All @@ -56,6 +68,6 @@ class Migration(migrations.Migration):
),
migrations.AddConstraint(
model_name='rawcontent',
constraint=models.UniqueConstraint(fields=('learning_package', 'mime_type', 'hash_digest'), name='oel_content_uniq_lc_mime_type_hash_digest'),
constraint=models.UniqueConstraint(fields=('learning_package', 'media_type', 'hash_digest'), name='oel_content_uniq_lc_media_type_hash_digest'),
),
]
102 changes: 88 additions & 14 deletions openedx_learning/core/contents/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
the simplest building blocks to store data with. They need to be composed into
more intelligent data models to be useful.
"""
from functools import cached_property

from django.conf import settings
from django.core.files.storage import default_storage
from django.core.validators import MaxValueValidator
Expand All @@ -18,6 +20,81 @@
from ..publishing.models import LearningPackage


class MediaType(models.Model):
"""
Stores Media types for use by RawContent models.
This is the same as MIME types (the IANA renamed MIME Types to Media Types).
We don't pre-populate this table, so APIs that add RawContent must ensure
that the desired Media Type exists.
Media types are written as {type}/{sub_type}+{suffix}, where suffixes are
seldom used.
* application/json
* text/css
* image/svg+xml
* application/vnd.openedx.xblock.v1.problem+xml
We have this as a separate model (instead of a field on RawContent) because:
1. We can save a lot on storage and indexing for RawContent if we're just
storing foreign key references there, rather than the entire content
string to be indexed. This is especially relevant for our (long) custom
types like "application/vnd.openedx.xblock.v1.problem+xml".
2. These values can occasionally change. For instance, "text/javascript" vs.
"application/javascript". Also, we will be using a fair number of "vnd."
style of custom content types, and we may want the flexibility of
changing that without having to worry about migrating millions of rows of
RawContent.
"""
# We're going to have many foreign key references from RawContent into this
# model, and we don't need to store those as 8-byte BigAutoField, as is the
# default for this app. It's likely that a SmallAutoField would work, but I
# can just barely imagine using more than 32K Media types if we have a bunch
# of custom "vnd." entries, or start tracking suffixes and parameters. Which
# is how we end up at the 4-byte AutoField.
id = models.AutoField(primary_key=True)

# Media types are denoted as {type}/{sub_type}+{suffix}. We currently do not
# support parameters.

# Media type, e.g. "application", "text", "image". Per RFC 4288, this can be
# at most 127 chars long and is case insensitive. In practice, it's almost
# always written in lowercase.
type = case_insensitive_char_field(max_length=127, blank=False, null=False)

# Media sub-type, e.g. "json", "css", "png". Per RFC 4288, this can be at
# most 127 chars long and is case insensitive. In practice, it's almost
# always written in lowercase.
sub_type = case_insensitive_char_field(max_length=127, blank=False, null=False)

# Suffix, usually just "xml" (e.g. "image/svg+xml"). Usually blank. I
# couldn't find an RFC description of the length limit, and 127 is probably
# excessive. But this table should be small enough where it doesn't really
# matter.
suffix = case_insensitive_char_field(max_length=127, blank=True, null=False)

class Meta:
constraints = [
# Make sure all (type + sub_type + suffix) combinations are unique.
models.UniqueConstraint(
fields=[
"type",
"sub_type",
"suffix",
],
name="oel_contents_uniq_t_st_sfx",
),
]

def __str__(self):
base = f"{self.type}/{self.sub_type}"
if self.suffix:
return f"{base}+{self.suffix}"
return base


class RawContent(models.Model): # type: ignore[django-manager-missing]
"""
This is the most basic piece of raw content data, with no version metadata.
Expand Down Expand Up @@ -77,15 +154,8 @@ class RawContent(models.Model): # type: ignore[django-manager-missing]
# openedx.lib.fields module.
hash_digest = hash_field()

# MIME type, such as "text/html", "image/png", etc. Per RFC 4288, MIME type
# and sub-type may each be 127 chars, making a max of 255 (including the "/"
# in between). Also, while MIME types are almost always written in lowercase
# as a matter of convention, by spec they are NOT case sensitive.
#
# DO NOT STORE parameters here, e.g. "charset=". We can make a new field if
# that becomes necessary. If we do decide to store parameters and values
# later, note that those *may be* case sensitive.
mime_type = case_insensitive_char_field(max_length=255, blank=False)
# What is the Media type (a.k.a. MIME type) of this data?
media_type = models.ForeignKey(MediaType, on_delete=models.PROTECT)

# This is the size of the raw data file in bytes. This can be different than
# the character length, since UTF-8 encoding can use anywhere between 1-4
Expand All @@ -107,28 +177,32 @@ class RawContent(models.Model): # type: ignore[django-manager-missing]
storage=settings.OPENEDX_LEARNING.get("STORAGE", default_storage), # type: ignore
)

@cached_property
def mime_type(self):
return str(self.media_type)

class Meta:
constraints = [
# Make sure we don't store duplicates of this raw data within the
# same LearningPackage, unless they're of different MIME types.
models.UniqueConstraint(
fields=[
"learning_package",
"mime_type",
"media_type",
"hash_digest",
],
name="oel_content_uniq_lc_mime_type_hash_digest",
name="oel_content_uniq_lc_media_type_hash_digest",
),
]
indexes = [
# LearningPackage MIME type Index:
# LearningPackage Media type Index:
# * Break down Content counts by type/subtype with in a
# LearningPackage.
# * Find all the Content in a LearningPackage that matches a
# certain MIME type (e.g. "image/png", "application/pdf".
models.Index(
fields=["learning_package", "mime_type"],
name="oel_content_idx_lp_mime_type",
fields=["learning_package", "media_type"],
name="oel_content_idx_lp_media_type",
),
# LearningPackage (reverse) Size Index:
# * Find largest Content in a LearningPackage.
Expand Down
34 changes: 34 additions & 0 deletions openedx_learning/lib/cache.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
"""
Test-friendly helpers for caching.
LRU caching can be especially helpful for Learning Core data models because so
much of it is immutable, e.g. Media Type lookups. But while these data models
are immutable within a given site, we also want to be able to track and clear
these caches across test runs. Later on, we may also want to inspect them to
make sure they are not growing overly large.
"""
import functools

# List of functions that have our
_lru_cached_fns = []


def lru_cache(*args, **kwargs):
"""
Thin wrapper over functools.lru_cache that lets us clear all caches later.
"""
def decorator(fn):
wrapped_fn = functools.lru_cache(*args, **kwargs)(fn)
_lru_cached_fns.append(wrapped_fn)
return wrapped_fn
return decorator


def clear_lru_caches():
"""
Clear all LRU caches that use our lru_cache decorator.
Useful for tests.
"""
for fn in _lru_cached_fns:
fn.cache_clear()
10 changes: 10 additions & 0 deletions openedx_learning/lib/fields.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,16 @@ def hash_field() -> models.CharField:
Use the create_hash_digest function to generate data suitable for this
field.
There are a couple of ways that we could have stored this more efficiently,
but we don't at this time:
1. A BinaryField would be the most space efficient, but Django doesn't
support indexing a BinaryField in a MySQL database.
2. We could make the field case-sensitive and run it through a URL-safe
base64 encoding. But the amount of space this saves vs. the complexity
didn't seem worthwhile, particularly the possibility of case-sensitivity
related bugs.
"""
return models.CharField(
max_length=40,
Expand Down
Loading

0 comments on commit b629e49

Please sign in to comment.