From 4e32f42e21fd0fcc7ed985b840df801d2b706cf0 Mon Sep 17 00:00:00 2001 From: Rowan Seymour Date: Tue, 26 Sep 2023 10:00:24 -0500 Subject: [PATCH 1/3] Fix checking S3 uploads so that we always check size but only check hash for files uploaded as single part --- archives/archives.go | 4 ---- archives/messages.go | 14 +++++++++----- archives/runs.go | 14 +++++++++----- archives/s3.go | 33 ++++++++++++++++++++------------- 4 files changed, 38 insertions(+), 27 deletions(-) diff --git a/archives/archives.go b/archives/archives.go index 79857b6..76f8dbc 100644 --- a/archives/archives.go +++ b/archives/archives.go @@ -535,10 +535,6 @@ func CreateArchiveFile(ctx context.Context, db *sqlx.DB, archive *Archive, archi return errors.Wrapf(err, "error calculating archive hash") } - if stat.Size() > 5e9 { - return fmt.Errorf("archive too large, must be smaller than 5 gigs, build dailies if possible") - } - archive.ArchiveFile = file.Name() archive.Size = stat.Size() archive.RecordCount = recordCount diff --git a/archives/messages.go b/archives/messages.go index 975933d..4c3f592 100644 --- a/archives/messages.go +++ b/archives/messages.go @@ -129,15 +129,19 @@ func DeleteArchivedMessages(ctx context.Context, config *Config, db *sqlx.DB, s3 }) log.Info("deleting messages") - // first things first, make sure our file is present on S3 - md5, err := GetS3FileETAG(outer, s3Client, archive.URL) + // first things first, make sure our file is correct on S3 + s3Size, s3Hash, err := GetS3FileInfo(outer, s3Client, archive.URL) if err != nil { return err } - // if our etag and archive md5 don't match, that's an error, return - if md5 != archive.Hash { - return fmt.Errorf("archive md5: %s and s3 etag: %s do not match", archive.Hash, md5) + if s3Size != archive.Size { + return fmt.Errorf("archive size: %d and s3 size: %d do not match", archive.Size, s3Size) + } + + // if S3 hash is MD5 then check against archive hash + if archive.Size <= maxSingleUploadBytes && s3Hash != archive.Hash { + return fmt.Errorf("archive md5: %s and s3 etag: %s do not match", archive.Hash, s3Hash) } // ok, archive file looks good, let's build up our list of message ids, this may be big but we are int64s so shouldn't be too big diff --git a/archives/runs.go b/archives/runs.go index 87eabfc..304c8da 100644 --- a/archives/runs.go +++ b/archives/runs.go @@ -124,15 +124,19 @@ func DeleteArchivedRuns(ctx context.Context, config *Config, db *sqlx.DB, s3Clie }) log.Info("deleting runs") - // first things first, make sure our file is present on S3 - md5, err := GetS3FileETAG(outer, s3Client, archive.URL) + // first things first, make sure our file is correct on S3 + s3Size, s3Hash, err := GetS3FileInfo(outer, s3Client, archive.URL) if err != nil { return err } - // if our etag and archive md5 don't match, that's an error, return - if md5 != archive.Hash { - return fmt.Errorf("archive md5: %s and s3 etag: %s do not match", archive.Hash, md5) + if s3Size != archive.Size { + return fmt.Errorf("archive size: %d and s3 size: %d do not match", archive.Size, s3Size) + } + + // if S3 hash is MD5 then check against archive hash + if archive.Size <= maxSingleUploadBytes && s3Hash != archive.Hash { + return fmt.Errorf("archive md5: %s and s3 etag: %s do not match", archive.Hash, s3Hash) } // ok, archive file looks good, let's build up our list of run ids, this may be big but we are int64s so shouldn't be too big diff --git a/archives/s3.go b/archives/s3.go index cc55d65..5e7c3ca 100644 --- a/archives/s3.go +++ b/archives/s3.go @@ -20,7 +20,13 @@ import ( 
"github.com/sirupsen/logrus" ) -var s3BucketURL = "https://%s.s3.amazonaws.com%s" +const s3BucketURL = "https://%s.s3.amazonaws.com%s" + +// any file over this needs to be uploaded in chunks +const maxSingleUploadBytes = 5e9 // 5GB + +// size of chunk to use when doing multi-part uploads +const chunkSizeBytes = 1e9 // 1GB // NewS3Client creates a new s3 client from the passed in config, testing it as necessary func NewS3Client(config *Config) (s3iface.S3API, error) { @@ -81,7 +87,7 @@ func UploadToS3(ctx context.Context, s3Client s3iface.S3API, bucket string, path md5 := base64.StdEncoding.EncodeToString(hashBytes) // if this fits into a single part, upload that way - if archive.Size <= 5e9 { + if archive.Size <= maxSingleUploadBytes { params := &s3.PutObjectInput{ Bucket: aws.String(bucket), Body: f, @@ -97,11 +103,11 @@ func UploadToS3(ctx context.Context, s3Client s3iface.S3API, bucket string, path return err } } else { - // this file is bigger than 5 gigs, use an upload manager instead, it will take care of uploading in parts + // this file is bigger than limit, use an upload manager instead, it will take care of uploading in parts uploader := s3manager.NewUploaderWithClient( s3Client, func(u *s3manager.Uploader) { - u.PartSize = 1e9 // 1 gig per part + u.PartSize = chunkSizeBytes }, ) params := &s3manager.UploadInput{ @@ -129,17 +135,17 @@ func withAcceptEncoding(e string) request.Option { } } -// GetS3FileETAG returns the ETAG hash for the passed in file -func GetS3FileETAG(ctx context.Context, s3Client s3iface.S3API, fileURL string) (string, error) { +// GetS3FileInfo returns the ETAG hash for the passed in file +func GetS3FileInfo(ctx context.Context, s3Client s3iface.S3API, fileURL string) (int64, string, error) { u, err := url.Parse(fileURL) if err != nil { - return "", err + return 0, "", err } bucket := strings.Split(u.Host, ".")[0] path := u.Path - output, err := s3Client.HeadObjectWithContext( + head, err := s3Client.HeadObjectWithContext( ctx, &s3.HeadObjectInput{ Bucket: aws.String(bucket), @@ -148,16 +154,17 @@ func GetS3FileETAG(ctx context.Context, s3Client s3iface.S3API, fileURL string) ) if err != nil { - return "", err + return 0, "", err } - if output.ETag == nil { - return "", fmt.Errorf("no ETAG for object") + if head.ContentLength == nil || head.ETag == nil { + return 0, "", fmt.Errorf("no size or ETag returned for S3 object") } // etag is quoted, remove them - etag := strings.Trim(*output.ETag, `"`) - return etag, nil + etag := strings.Trim(*head.ETag, `"`) + + return *head.ContentLength, etag, nil } // GetS3File return an io.ReadCloser for the passed in bucket and path From 8165728a2babf2e21d263e98bae5651a6ec7d972 Mon Sep 17 00:00:00 2001 From: Rowan Seymour Date: Tue, 26 Sep 2023 11:09:34 -0500 Subject: [PATCH 2/3] Allow disabling of hash checking --- archives/config.go | 14 ++++++++------ archives/messages.go | 2 +- archives/runs.go | 2 +- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/archives/config.go b/archives/config.go index ad001d2..596b56e 100644 --- a/archives/config.go +++ b/archives/config.go @@ -17,9 +17,10 @@ type Config struct { AWSAccessKeyID string `help:"the access key id to use when authenticating S3"` AWSSecretAccessKey string `help:"the secret access key id to use when authenticating S3"` - TempDir string `help:"directory where temporary archive files are written"` - KeepFiles bool `help:"whether we should keep local archive files after upload (default false)"` - UploadToS3 bool `help:"whether we should upload archive to 
S3"` + TempDir string `help:"directory where temporary archive files are written"` + KeepFiles bool `help:"whether we should keep local archive files after upload (default false)"` + UploadToS3 bool `help:"whether we should upload archive to S3"` + CheckS3Hashes bool `help:"whether to check S3 hashes of uploaded archives before deleting records"` ArchiveMessages bool `help:"whether we should archive messages"` ArchiveRuns bool `help:"whether we should archive runs"` @@ -50,9 +51,10 @@ func NewDefaultConfig() *Config { AWSAccessKeyID: "", AWSSecretAccessKey: "", - TempDir: "/tmp", - KeepFiles: false, - UploadToS3: true, + TempDir: "/tmp", + KeepFiles: false, + UploadToS3: true, + CheckS3Hashes: true, ArchiveMessages: true, ArchiveRuns: true, diff --git a/archives/messages.go b/archives/messages.go index 4c3f592..d45b711 100644 --- a/archives/messages.go +++ b/archives/messages.go @@ -140,7 +140,7 @@ func DeleteArchivedMessages(ctx context.Context, config *Config, db *sqlx.DB, s3 } // if S3 hash is MD5 then check against archive hash - if archive.Size <= maxSingleUploadBytes && s3Hash != archive.Hash { + if config.CheckS3Hashes && archive.Size <= maxSingleUploadBytes && s3Hash != archive.Hash { return fmt.Errorf("archive md5: %s and s3 etag: %s do not match", archive.Hash, s3Hash) } diff --git a/archives/runs.go b/archives/runs.go index 304c8da..9a35957 100644 --- a/archives/runs.go +++ b/archives/runs.go @@ -135,7 +135,7 @@ func DeleteArchivedRuns(ctx context.Context, config *Config, db *sqlx.DB, s3Clie } // if S3 hash is MD5 then check against archive hash - if archive.Size <= maxSingleUploadBytes && s3Hash != archive.Hash { + if config.CheckS3Hashes && archive.Size <= maxSingleUploadBytes && s3Hash != archive.Hash { return fmt.Errorf("archive md5: %s and s3 etag: %s do not match", archive.Hash, s3Hash) } From 7bbf668d9485910c7b88748039dac52f4659ec1e Mon Sep 17 00:00:00 2001 From: Rowan Seymour Date: Tue, 26 Sep 2023 11:47:25 -0500 Subject: [PATCH 3/3] Update README --- README.md | 64 ++++++------------------------------------------------- 1 file changed, 7 insertions(+), 57 deletions(-) diff --git a/README.md b/README.md index d984471..a9339e8 100644 --- a/README.md +++ b/README.md @@ -26,15 +26,14 @@ We recommend running it with no changes to the configuration and no parameters, environment variables to configure it. You can use `% rp-archiver --help` to see a list of the environment variables and parameters and for more details on each option. -### RapidPro - -For use with RapidPro, you will want to configure these settings: +For use with RapidPro/TextIt, you will need to configure these settings: * `ARCHIVER_DB`: URL describing how to connect to the database (default "postgres://temba:temba@localhost/temba?sslmode=disable") * `ARCHIVER_TEMP_DIR`: The directory that temporary archives will be written before upload (default "/tmp") * `ARCHIVER_DELETE`: Whether to delete messages and runs after they are archived, we recommend setting this to true for large installations (default false) -For writing of archives, Archiver needs access to an S3 bucket, you can configure access to your bucket via: +For writing of archives, Archiver needs access to a storage bucket on an S3 compatible service. For AWS we recommend that +you choose SSE-S3 encryption as this is the only type that supports validation of upload ETags. 
 * `ARCHIVER_S3_REGION`: The region for your S3 bucket (ex: `eu-west-1`)
 * `ARCHIVER_S3_BUCKET`: The name of your S3 bucket (ex: `dl-archiver-test`)
@@ -42,62 +41,13 @@ For writing of archives, Archiver needs access to an S3 bucket, you can configur
 * `ARCHIVER_AWS_ACCESS_KEY_ID`: The AWS access key id used to authenticate to AWS
 * `ARCHIVER_AWS_SECRET_ACCESS_KEY`: The AWS secret access key used to authenticate to AWS
 
-Recommended settings for error reporting:
-
- * `ARCHIVER_SENTRY_DSN`: The DSN to use when logging errors to Sentry
+If using a different encryption type, or a service that produces non-MD5 ETags:
 
-### Reference
+ * `ARCHIVER_CHECK_S3_HASHES`: Can be set to `FALSE` to disable checking of upload hashes (see the sketch below).
 
-These are the configuration options that can be provided as parameters or environment variables. If using environment
-varibles, convert to uppercase, replace dashes with underscores and prefix the name with `ARCHIVER_`, e.g. `-log-level`
-becomes `ARCHIVER_LOG_LEVEL`.
+Recommended settings for error reporting:
 
-```
-  -archive-messages
-    	whether we should archive messages (default true)
-  -archive-runs
-    	whether we should archive runs (default true)
-  -aws-access-key-id string
-    	the access key id to use when authenticating S3 (default none)
-  -aws-secret-access-key string
-    	the secret access key id to use when authenticating S3 (default none)
-  -db string
-    	the connection string for our database (default "postgres://localhost/archiver_test?sslmode=disable")
-  -debug-conf
-    	print where config values are coming from
-  -delete
-    	whether to delete messages and runs from the db after archival (default false)
-  -help
-    	print usage information
-  -keep-files
-    	whether we should keep local archive files after upload (default false)
-  -librato-username
-    	the Librato username for metrics reporting
-  -librato-token
-    	the Librato token for metrics reporting
-  -log-level string
-    	the log level, one of error, warn, info, debug (default "info")
-  -once
-    	run archving immediately and then exit
-  -retention-period int
-    	the number of days to keep before archiving (default 90)
-  -s3-bucket string
-    	the S3 bucket we will write archives to (default "dl-archiver-test")
-  -s3-disable-ssl
-    	whether we disable SSL when accessing S3. Should always be set to False unless you're hosting an S3 compatible service within a secure internal network
-  -s3-endpoint string
-    	the S3 endpoint we will write archives to (default "https://s3.amazonaws.com")
-  -s3-force-path-style
-    	whether we force S3 path style. Should generally need to default to False unless you're hosting an S3 compatible service
-  -s3-region string
-    	the S3 region we will write archives to (default "us-east-1")
-  -sentry-dsn string
-    	the sentry configuration to log errors to, if any
-  -temp-dir string
-    	directory where temporary archive files are written (default "/tmp")
-  -upload-to-s3
-    	whether we should upload archive to S3 (default true)
-```
+ * `ARCHIVER_SENTRY_DSN`: The DSN to use when logging errors to Sentry
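To make the ETag caveat above concrete: for a single-part upload, S3's ETag is just the hex MD5 of the object, which is why it can be compared against the archive's own hash. For a multipart upload, the ETag is instead the MD5 of the concatenated binary part digests with a `-<part count>` suffix, so it can never match a plain file hash. The following standalone Go sketch (illustrative only, not part of this codebase) mimics both derivations:

```go
package main

import (
	"crypto/md5"
	"encoding/hex"
	"fmt"
)

// singlePartETag mimics the ETag S3 assigns to a single-part (PutObject)
// upload: the hex MD5 of the whole body.
func singlePartETag(body []byte) string {
	sum := md5.Sum(body)
	return hex.EncodeToString(sum[:])
}

// multipartETag mimics the ETag S3 assigns to a multipart upload: the hex
// MD5 of the concatenated raw part digests, suffixed with the part count.
func multipartETag(parts ...[]byte) string {
	digests := make([]byte, 0, md5.Size*len(parts))
	for _, p := range parts {
		sum := md5.Sum(p)
		digests = append(digests, sum[:]...)
	}
	outer := md5.Sum(digests)
	return fmt.Sprintf("%s-%d", hex.EncodeToString(outer[:]), len(parts))
}

func main() {
	body := []byte("example archive contents")

	// single part: the ETag is directly comparable to an MD5 of the file
	fmt.Println(singlePartETag(body))

	// two parts: the ETag is not an MD5 of the file and ends in "-2"
	fmt.Println(multipartETag(body[:10], body[10:]))
}
```

AWS documents this MD5 behaviour only for plaintext and SSE-S3 objects; SSE-KMS and SSE-C ETags are not MD5 digests at all, which is what the SSE-S3 recommendation and the `ARCHIVER_CHECK_S3_HASHES` escape hatch above are about.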
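To spot-check an uploaded archive by hand during development, a throwaway harness along the following lines can exercise the new helper directly. This is a sketch with assumptions: the credentials and file URL are placeholders to adjust for your environment, and `NewS3Client` may validate access against the configured bucket before returning:

```go
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/nyaruka/rp-archiver/archives"
)

func main() {
	// start from the defaults, then fill in real credentials
	config := archives.NewDefaultConfig()
	config.AWSAccessKeyID = "your-access-key-id"
	config.AWSSecretAccessKey = "your-secret-access-key"

	s3Client, err := archives.NewS3Client(config)
	if err != nil {
		log.Fatal(err)
	}

	// a placeholder URL in the https://<bucket>.s3.amazonaws.com/<path>
	// form that GetS3FileInfo parses
	size, etag, err := archives.GetS3FileInfo(context.Background(), s3Client, "https://my-bucket.s3.amazonaws.com/archives/example.jsonl.gz")
	if err != nil {
		log.Fatal(err)
	}

	// a single-part upload's ETag should equal the archive's MD5;
	// a multipart upload's ETag will end in "-<part count>" instead
	fmt.Printf("size: %d etag: %s\n", size, etag)
}
```

 ## Development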