Skip to content

Commit

Permalink
Add datapackage.json generation in WACZ (#15)
Browse files Browse the repository at this point in the history
  • Loading branch information
Wesley van Lee committed Oct 28, 2024
1 parent ba6dcaa commit 69b2ad5
Show file tree
Hide file tree
Showing 9 changed files with 253 additions and 20 deletions.
13 changes: 13 additions & 0 deletions docs/settings.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,19 @@ This is the output path of the WACZ file. Multiple variables can be added that a

Supported variables: `spider`, `year`, `month`, `day` and `timestamp`.

### `SW_WACZ_TITLE`

This setting defines the title of the WACZ used in the `datapackage.json`, which is generated during the WACZ creation. It will default to the spider name if it is not configured.

### `SW_WACZ_DESCRIPTION`

This setting defines the description of the WACZ used in the `datapackage.json`, which is generated during the WACZ creation. If it is not configured, it defaults to:

> This is the web archive generated by a scrapy-webarchive extension for the
> <spider_name> spider. It is mainly for scraping purposes as it does not contain
> any js/css data. Though it can be replayed as bare HTML if the site does not depend on
> JavaScript.
## Downloader middleware and spider middleware

### `SW_WACZ_SOURCE_URI`
Expand Down
1 change: 1 addition & 0 deletions scrapy_webarchive/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Package version string; wacz.py imports it to fill the "software" field of datapackage.json.
__version__ = "0.0.1.dev2"
13 changes: 10 additions & 3 deletions scrapy_webarchive/extensions.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from twisted.internet.defer import Deferred
from typing_extensions import Any, Dict, Protocol, Self, Type, Union, cast

from scrapy_webarchive.utils import get_scheme_from_uri, get_warc_date
from scrapy_webarchive.utils import WARC_DT_FORMAT, get_formatted_dt_string, get_scheme_from_uri
from scrapy_webarchive.wacz import WaczFileCreator
from scrapy_webarchive.warc import WarcFileWriter

Expand Down Expand Up @@ -112,7 +112,7 @@ def spider_opened(self) -> None:
self.writer.write_warcinfo(robotstxt_obey=self.settings["ROBOTSTXT_OBEY"])

def response_received(self, response: Response, request: Request, spider: Spider) -> None:
request.meta["WARC-Date"] = get_warc_date()
request.meta["WARC-Date"] = get_formatted_dt_string(format=WARC_DT_FORMAT)

# Write response WARC record
record = self.writer.write_response(response, request)
Expand All @@ -127,7 +127,14 @@ def response_received(self, response: Response, request: Request, spider: Spider
self.stats.inc_value("webarchive/exporter/request_written", spider=spider)

def spider_closed(self, spider: Spider) -> None:
    """Create the WACZ archive from the WARC file written by this extension.

    The archive is built exactly once. The title and description are read from
    the ``SW_WACZ_TITLE`` and ``SW_WACZ_DESCRIPTION`` settings and passed through
    unchanged; ``WaczFileCreator`` substitutes its own defaults when they are empty.

    Args:
        spider: The spider whose name is used as the WACZ collection name.
    """

    wacz_creator = WaczFileCreator(
        store=self.store,
        warc_fname=self.writer.warc_fname,
        collection_name=spider.name,
        title=self.settings["SW_WACZ_TITLE"],
        description=self.settings["SW_WACZ_DESCRIPTION"],
    )
    wacz_creator.create()


def get_archive_uri_template_dt_variables() -> dict:
Expand Down
31 changes: 25 additions & 6 deletions scrapy_webarchive/utils.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,22 @@
from __future__ import annotations

import hashlib
import logging
from datetime import datetime, timezone
from pathlib import Path
from typing import IO, Tuple
from urllib.parse import urlparse, urlunparse

from scrapy.settings import Settings

# strftime layout for WARC-Date values and the datapackage "created"/"modified" fields.
# The trailing "Z" is a literal; callers format timestamps taken in UTC.
WARC_DT_FORMAT = "%Y-%m-%dT%H:%M:%SZ"
# Compact strftime layout used when building WARC/WACZ file names.
TIMESTAMP_DT_FORMAT = "%Y%m%d%H%M%S"
# Chunk size in bytes (64 KiB) used when reading a stream for hashing.
BUFF_SIZE = 1024 * 64

logger = logging.getLogger(__name__)

def get_current_timestamp() -> str:
    """Return the current UTC time rendered with ``TIMESTAMP_DT_FORMAT``."""
    now_utc = datetime.now(timezone.utc)
    return now_utc.strftime(TIMESTAMP_DT_FORMAT)


def get_warc_date() -> str:
    """Return the current UTC time rendered with ``WARC_DT_FORMAT``."""
    now_utc = datetime.now(timezone.utc)
    return now_utc.strftime(WARC_DT_FORMAT)
def get_formatted_dt_string(format: str) -> str:
    """Return the current UTC time rendered with the given ``strftime`` format."""
    now_utc = datetime.now(timezone.utc)
    return now_utc.strftime(format)


def header_lines_to_dict(lines):
Expand Down Expand Up @@ -78,3 +79,21 @@ def add_ftp_credentials(wacz_uri: str, settings: Settings) -> str:
return urlunparse(updated_uri)

return wacz_uri


def hash_stream(hash_type: str, stream: IO[bytes]) -> Tuple[int, str]:
    """Hash a binary stream and count the bytes that were read from it.

    Args:
        hash_type: Name of a ``hashlib`` algorithm, e.g. ``"sha256"``.
        stream: Readable binary stream. It is read in ``BUFF_SIZE`` chunks until
            ``read`` returns ``b""``.

    Returns:
        A ``(size, digest)`` tuple: ``size`` is the number of bytes read and
        ``digest`` is formatted as ``"<hash_type>:<hexdigest>"``.

    Raises:
        ValueError: If ``hash_type`` is not in ``hashlib.algorithms_guaranteed``,
            or is a variable-length SHAKE algorithm.
    """

    # At this moment the `hash_type` (or algorithm) that we pass will always be sha256 as it is hardcoded.
    # This check is implemented in case any other algorithms will be made available in the future.
    # The SHAKE algorithms are guaranteed as well, but their `hexdigest()` needs an explicit length and
    # would otherwise fail with a TypeError only after the whole stream has been consumed.
    if hash_type not in hashlib.algorithms_guaranteed or hash_type.startswith("shake_"):
        raise ValueError(f"Unsupported hash type: {hash_type}")

    hasher = hashlib.new(hash_type)

    size = 0
    for chunk in iter(lambda: stream.read(BUFF_SIZE), b""):
        size += len(chunk)
        hasher.update(chunk)

    return size, f"{hash_type}:{hasher.hexdigest()}"
94 changes: 91 additions & 3 deletions scrapy_webarchive/wacz.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,38 +2,52 @@

import gzip
import io
import json
import os
import zipfile
from collections import defaultdict
from functools import partial
from typing import Any

from scrapy import __version__ as scrapy_version
from scrapy.settings import Settings
from smart_open import open as smart_open
from typing_extensions import IO, TYPE_CHECKING, Dict, Generator, List, Union
from warc.warc import WARCRecord

from scrapy_webarchive import __version__ as scrapy_webarchive_version
from scrapy_webarchive.cdxj import CdxjRecord, write_cdxj_index
from scrapy_webarchive.utils import (
TIMESTAMP_DT_FORMAT,
WARC_DT_FORMAT,
add_ftp_credentials,
get_current_timestamp,
get_formatted_dt_string,
get_gcs_client,
get_s3_client,
get_scheme_from_uri,
hash_stream,
)
from scrapy_webarchive.warc import WARCReader

if TYPE_CHECKING:
from scrapy_webarchive.extensions import FilesStoreProtocol


# Value written to the "wacz_version" field of the generated datapackage.json.
WACZ_VERSION = "1.1.1"

class WaczFileCreator:
"""Handles creating WACZ archives."""

def __init__(self, store: 'FilesStoreProtocol', warc_fname: str, collection_name: str, cdxj_fname: str = "index.cdxj") -> None:
hash_type = "sha256"
datapackage_fname = "datapackage.json"

def __init__(self, store: 'FilesStoreProtocol', warc_fname: str, collection_name: str, title: str, description: str, cdxj_fname: str = "index.cdxj") -> None:
self.store = store
self.warc_fname = warc_fname
self.cdxj_fname = cdxj_fname
self.collection_name = collection_name
self._title = title
self._description = description

def create(self) -> None:
"""Create the WACZ file from the WARC and CDXJ index and save it in the configured store."""
Expand All @@ -59,6 +73,7 @@ def create_wacz_zip(self) -> io.BytesIO:
with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zip_file:
self.write_to_zip(zip_file, self.cdxj_fname, "indexes/")
self.write_to_zip(zip_file, self.warc_fname, "archive/")
self.write_datapackage(zip_file)

return zip_buffer

Expand All @@ -77,7 +92,80 @@ def cleanup_files(self, *files: str) -> None:
def get_wacz_fname(self) -> str:
    """Generate the WACZ filename from the collection name and the current UTC timestamp."""

    timestamp = get_formatted_dt_string(format=TIMESTAMP_DT_FORMAT)
    return f"{self.collection_name}-{timestamp}.wacz"

def write_datapackage(self, zip_file: zipfile.ZipFile) -> None:
    """Assemble the datapackage metadata and write it into the archive.

    The resource list is collected before the datapackage file itself is
    written, so it only covers entries already present in ``zip_file``.
    """

    metadata = self.create_package_dict()
    warc_member = "archive/" + self.warc_fname

    # Read the WARC back out of the archive to pick up the main page details.
    with zip_file.open(warc_member) as warc_stream:
        metadata = self.update_package_metadata_from_warc(warc_stream, metadata)

    metadata["resources"] = self.collect_resources(zip_file)

    serialized = json.dumps(metadata, indent=2)
    zip_file.writestr(self.datapackage_fname, serialized)

def create_package_dict(self) -> Dict[str, Any]:
    """Build the base datapackage metadata dictionary.

    ``created`` and ``modified`` share a single timestamp taken at call time.
    """

    now = get_formatted_dt_string(format=WARC_DT_FORMAT)

    package: Dict[str, Any] = {}
    package["profile"] = "data-package"
    package["title"] = self.title
    package["description"] = self.description
    package["created"] = now
    package["modified"] = now
    package["wacz_version"] = WACZ_VERSION
    package["software"] = f"scrapy-webarchive/{scrapy_webarchive_version}, Scrapy/{scrapy_version}"
    return package

def update_package_metadata_from_warc(self, warc_fh: IO, package_dict: Dict[str, Any]) -> Dict[str, Any]:
    """Add the main page URL and date from the first ``request`` record of the WARC.

    ``package_dict`` is updated in place and also returned. It is left untouched
    when the WARC holds no ``request`` record.
    """

    # The handle is transparently decompressed when the WARC file name ends in ".gz".
    if self.warc_fname.endswith(".gz"):
        reader = WARCReader(gzip.open(warc_fh))
    else:
        reader = WARCReader(warc_fh)

    record = reader.read_record()
    while record is not None:
        if record.type == "request":
            main_page = {
                "mainPageUrl": record.url,
                "mainPageDate": record.date,
            }
            package_dict.update(main_page)
            break
        record = reader.read_record()

    return package_dict

def collect_resources(self, zip_file: zipfile.ZipFile) -> List[Dict[str, Any]]:
    """Describe every entry currently in the zip file: name, path, hash and size."""

    def describe(entry: zipfile.ZipInfo) -> Dict[str, Any]:
        # Hash the entry's content as read back through the zip file.
        with zip_file.open(entry, "r") as entry_stream:
            byte_count, digest = hash_stream(self.hash_type, entry_stream)

        return {
            "name": os.path.basename(entry.filename).lower(),
            "path": entry.filename,
            "hash": digest,
            "bytes": byte_count,
        }

    return [describe(entry) for entry in zip_file.infolist()]

@property
def title(self):
return self._title or self.collection_name

@property
def description(self):
return self._description or "This is the web archive generated by a scrapy-webarchive extension for the " \
f"{self.collection_name} spider. It is mainly for scraping purposes as it does not contain " \
"any js/css data. Though it can be replayed as bare HTML if the site does not depend on JavaScript."


class WaczFile:
Expand Down
6 changes: 4 additions & 2 deletions scrapy_webarchive/warc.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

from scrapy_webarchive.cdxj import CdxjRecord
from scrapy_webarchive.exceptions import WaczMiddlewareException
from scrapy_webarchive.utils import get_current_timestamp, header_lines_to_dict
from scrapy_webarchive.utils import TIMESTAMP_DT_FORMAT, get_formatted_dt_string, header_lines_to_dict


def generate_warc_fname(prefix: str) -> str:
Expand All @@ -28,10 +28,12 @@ def generate_warc_fname(prefix: str) -> str:
{prefix}-{timestamp}-{serial}-{crawlhost}.warc.gz
"""

timestamp = get_formatted_dt_string(format=TIMESTAMP_DT_FORMAT)
crawlhost = socket.gethostname().split(".")[0]
# As of now we only generate one WARC file. Add serial in here to adhere to the warc specification.
serial = '00000'
return "-".join([prefix, get_current_timestamp(), serial, crawlhost]) + ".warc.gz"

return "-".join([prefix, timestamp, serial, crawlhost]) + ".warc.gz"


class WARCReader(BaseWARCReader):
Expand Down
17 changes: 17 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import pytest


@pytest.fixture
def warc_example() -> bytes:
    """Return the raw bytes of one WARC/1.0 ``request`` record for http://example.com/."""
    # Each line ends in a backslash so the literal contains no bare newlines, only the explicit \r\n pairs.
    return b"\
WARC/1.0\r\n\
Content-Length: 10\r\n\
WARC-Date: 2024-02-10T16:15:52Z\r\n\
Content-Type: application/http; msgtype=request\r\n\
WARC-Type: request\r\n\
WARC-Record-ID: <urn:uuid:80fb9262-5402-11e1-8206-545200690126>\r\n\
WARC-Target-URI: http://example.com/\r\n\
\r\n\
Helloworld\
\r\n\r\n\
"
53 changes: 53 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import hashlib
import io

import pytest

from scrapy_webarchive.utils import BUFF_SIZE, hash_stream


def test_hash_stream_with_empty_stream():
    """An empty stream reports size 0 and the sha256 digest of zero bytes."""
    payload = b""

    byte_count, digest = hash_stream("sha256", io.BytesIO(payload))

    assert byte_count == 0
    assert digest == f"sha256:{hashlib.sha256(payload).hexdigest()}"

def test_hash_stream_with_md5_algorithm():
    """Hashing with md5 yields the payload size and an ``md5:``-prefixed digest."""
    payload = b"Hello world"
    md5_hex = hashlib.md5(payload).hexdigest()

    byte_count, digest = hash_stream("md5", io.BytesIO(payload))

    assert byte_count == len(payload)
    assert digest == f"md5:{md5_hex}"

def test_hash_stream_with_sha256_algorithm():
    """Hashing with sha256 yields the payload size and a ``sha256:``-prefixed digest."""
    payload = b"Hello world"
    sha256_hex = hashlib.sha256(payload).hexdigest()

    byte_count, digest = hash_stream("sha256", io.BytesIO(payload))

    assert byte_count == len(payload)
    assert digest == f"sha256:{sha256_hex}"

def test_hash_stream_with_unsupported_hash_type():
    """An unknown algorithm name is rejected with ``ValueError``."""
    payload_stream = io.BytesIO(b"Hello world")

    with pytest.raises(ValueError):
        hash_stream("unsupported_hash", payload_stream)

def test_hash_stream_with_large_stream():
    """A payload spanning several read chunks is hashed and counted in full."""
    payload = b"a" * (2 * BUFF_SIZE)  # exactly two buffer-sized chunks
    sha256_hex = hashlib.sha256(payload).hexdigest()

    byte_count, digest = hash_stream("sha256", io.BytesIO(payload))

    assert byte_count == len(payload)
    assert digest == f"sha256:{sha256_hex}"
Loading

0 comments on commit 69b2ad5

Please sign in to comment.