Skip to content

Commit

Permalink
fix: improve logging, add extra template
Browse files Browse the repository at this point in the history
  • Loading branch information
xor22h committed Apr 30, 2024
1 parent a1bcf48 commit 5bd0720
Show file tree
Hide file tree
Showing 8 changed files with 267 additions and 26 deletions.
1 change: 1 addition & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,5 @@ FROM debian:bookworm
RUN apt update && apt install -y libtesseract5
COPY --from=build /usr/bin/rok-server /usr/bin/rok-server
EXPOSE 8080
ENV GOMEMLIMIT=100MiB
ENTRYPOINT [ "/usr/bin/rok-server" ]
4 changes: 3 additions & 1 deletion cmd/rok-scanner/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,12 @@ package main

import (
"fmt"
"github.com/rokmonster/ocr/internal/pkg/rokocr/tesseractutils"
"os"
"strings"
"time"

"github.com/rokmonster/ocr/internal/pkg/rokocr/tesseractutils"

"github.com/olekukonko/tablewriter"
config "github.com/rokmonster/ocr/internal/pkg/config/scannerconfig"
schema "github.com/rokmonster/ocr/internal/pkg/ocrschema"
Expand Down Expand Up @@ -50,6 +51,7 @@ func writeCSV(data []schema.OCRResult, template schema.OCRTemplate) {
}

func main() {

rokocr.Prepare(flags.CommonConfiguration)
rokocr.DownloadTesseractData(flags.CommonConfiguration)
rokocr.PreloadTemplates(flags.CommonConfiguration)
Expand Down
4 changes: 2 additions & 2 deletions docs/install/golang_mac.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@ curl -LO https://go.dev/dl/go1.19.2.darwin-amd64.pkg && open ./go1.19.2.darwin-a
# install tesseract & opencv
brew install tesseract opencv
# You might need these
# export CPATH="/opt/homebrew/include"
# export LIBRARY_PATH="/opt/homebrew/lib"
export CPATH="/opt/homebrew/include"
export LIBRARY_PATH="/opt/homebrew/lib"
```

### PLEASE CHECK FOR ERRORS!!!
Expand Down
3 changes: 3 additions & 0 deletions internal/pkg/ocrschema/result-schema.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
package ocrschema

import "time"

// OCRResult is the outcome of running OCR over a single image file.
// All fields are exported and carry JSON tags, so the value can be
// serialized directly.
type OCRResult struct {
// Filename names the image this result belongs to; serialized as "filename".
Filename string `json:"filename"`
// Data holds the extracted values keyed by string; serialized as "data".
// NOTE(review): keys are presumably the template's field names — confirm against the producer.
Data map[string]interface{} `json:"data"`
// Took is how long processing this image took; serialized as "duration".
// NOTE(review): time.Duration is an int64 nanosecond count, so encoding/json
// emits it as a plain integer of nanoseconds, not a formatted string — confirm
// that consumers of the JSON expect that unit.
Took time.Duration `json:"duration"`
}
8 changes: 6 additions & 2 deletions internal/pkg/rokocr/tesseractutils/parser.go
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
package tesseractutils

import (
imgutils2 "github.com/rokmonster/ocr/internal/pkg/utils/imgutils"
"github.com/rokmonster/ocr/internal/pkg/utils/stringutils"
"image"
"os"
"path/filepath"
"time"

imgutils2 "github.com/rokmonster/ocr/internal/pkg/utils/imgutils"
"github.com/rokmonster/ocr/internal/pkg/utils/stringutils"

log "github.com/sirupsen/logrus"

Expand All @@ -14,6 +16,7 @@ import (

func ParseImage(name string, img image.Image, template schema.OCRTemplate, tmpdir, tessdata string) schema.OCRResult {
log.Debugf("[%s] Processing with template: %s", filepath.Base(name), template.Title)
start := time.Now()

results := make(map[string]interface{})

Expand All @@ -35,5 +38,6 @@ func ParseImage(name string, img image.Image, template schema.OCRTemplate, tmpdi
return schema.OCRResult{
Filename: filepath.Base(name),
Data: results,
Took: time.Since(start),
}
}
18 changes: 2 additions & 16 deletions internal/pkg/rokocr/tesseractutils/recognition.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ import (
"fmt"
"os"
"path/filepath"
"time"

"github.com/rokmonster/ocr/internal/pkg/utils/fileutils"
"github.com/rokmonster/ocr/internal/pkg/utils/imgutils"
Expand All @@ -13,17 +12,6 @@ import (
schema "github.com/rokmonster/ocr/internal/pkg/ocrschema"
)

// func processSingleFile(index, total int, f string) *schema.OCRResult {
// start := time.Now()
// result, err := ParseSingleFile(f, tessData, template, force)
// if err != nil {
// logrus.Printf("[%04d/%04d] %v - %v", index, total, filepath.Base(f), err)
// return
// }
// logrus.Printf("[%04d/%04d] %v Took: %v ms", index, total, filepath.Base(f), time.Since(start).Milliseconds())
// return result
// }

func RunRecognitionChan(mediaDir, tessData string, template schema.OCRTemplate, force bool) <-chan schema.OCRResult {

out := make(chan schema.OCRResult)
Expand All @@ -33,13 +21,11 @@ func RunRecognitionChan(mediaDir, tessData string, template schema.OCRTemplate,
total := len(files)

for index, f := range files {
start := time.Now()
result, err := ParseSingleFile(f, tessData, template, force)
if err != nil {
logrus.Printf("[%04d/%04d] %v - %v", index, total, filepath.Base(f), err)
logrus.Errorf("[%04d/%04d] %v - %v", index, total, filepath.Base(f), err)
continue
}
logrus.Printf("[%04d/%04d] %v Took: %v ms", index, total, filepath.Base(f), time.Since(start).Milliseconds())
out <- *result
}
close(out)
Expand Down Expand Up @@ -69,5 +55,5 @@ func ParseSingleFile(f, tessData string, template schema.OCRTemplate, force bool
return &result, nil
}

return nil, fmt.Errorf("image doesn't match the template")
return nil, fmt.Errorf("image doesn't match the template: Template: %s @ %s", template.Title, template.Version)
}
11 changes: 6 additions & 5 deletions internal/pkg/www/jobs_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,19 @@ import (
"encoding/binary"
"encoding/json"
"fmt"
"github.com/rokmonster/ocr/internal/pkg/www/middlewares"
"net/http"
"os"
"path/filepath"
"strconv"
"strings"
"time"

"github.com/rokmonster/ocr/internal/pkg/rokocr/tesseractutils"
"github.com/rokmonster/ocr/internal/pkg/utils/fileutils"

"github.com/gin-gonic/gin"
"github.com/rokmonster/ocr/internal/pkg/ocrschema"
"github.com/rokmonster/ocr/internal/pkg/rokocr"
"github.com/rokmonster/ocr/internal/pkg/rokocr/tesseractutils"
"github.com/rokmonster/ocr/internal/pkg/utils/fileutils"
"github.com/rokmonster/ocr/internal/pkg/www/middlewares"
log "github.com/sirupsen/logrus"
bolt "go.etcd.io/bbolt"
)
Expand Down Expand Up @@ -218,11 +217,13 @@ func (controller *JobsController) StartJobByID(c *gin.Context) {
if len(templates) > 0 {
log.Debugf("Loaded %v templates", len(templates))
template := ocrschema.FindTemplate(mediaDir, templates)
log.Infof("[Job: %04d] Picked template: %s by %s", job.ID, template.Title, template.Author)
_ = controller.updateJobTemplate(job.ID, template)

var data []ocrschema.OCRResult
for elem := range tesseractutils.RunRecognitionChan(mediaDir, controller.tessdataDir, template, false) {
for elem := range tesseractutils.RunRecognitionChan(mediaDir, controller.tessdataDir, template, true) {
data = append(data, elem)
log.Printf("[Job: %04d][%04d/%04d] %v Took: %v ms", job.ID, index, fileCount, elem.Filename, elem.Took.Milliseconds())
index = index + 1
_ = controller.updateJobStatus(job.ID, fmt.Sprintf("Processing: %v/%v", index, fileCount))
_ = controller.updateJobResults(job.ID, data)
Expand Down
Loading

0 comments on commit 5bd0720

Please sign in to comment.