diff --git a/src/promnesia/common.py b/src/promnesia/common.py
index 5a54b25c..000ba652 100644
--- a/src/promnesia/common.py
+++ b/src/promnesia/common.py
@@ -586,3 +586,12 @@ def measure(tag: str='', *, logger, unit: str='ms'):
     mult = {'s': 1, 'ms': 10**3, 'us': 10**6}[unit]
     xx = secs * mult
     logger.debug(f'[{tag}]: {xx:.1f}{unit} elapsed')
+
+
+def is_sqlite_db(x: Path) -> bool:
+    """Return True if x is a file whose mime() result is one of the SQLite mime types below."""
+    return x.is_file() and mime(x) in {
+        'application/x-sqlite3',
+        'application/vnd.sqlite3',
+        # TODO this mime can also match wal files/journals, not sure
+    }
diff --git a/src/promnesia/sources/browser.py b/src/promnesia/sources/browser.py
new file mode 100644
index 00000000..11387f98
--- /dev/null
+++ b/src/promnesia/sources/browser.py
@@ -0,0 +1,87 @@
+import re
+from typing import Optional, Iterator, Any, TYPE_CHECKING
+import warnings
+
+from promnesia.common import Results, Visit, Loc, Second, PathIsh, logger, is_sqlite_db
+
+
+def index(p: Optional[PathIsh]=None) -> Results:
+    """Yield browser visits: via my.browser.all when p is None, else via an ad-hoc HPI config with a fallback onto browser_old."""
+    from . import hpi
+
+    if p is None:
+        from my.browser.all import history
+        yield from _index_new(history())
+        return
+
+    warnings.warn('Passing paths to promnesia.sources.browser is deprecated. You should switch to HPI for that. See https://github.com/seanbreckenridge/browserexport#hpi')
+
+    # even if the user doesn't have HPI config for my.browser set up, try indexing via an ad-hoc config first
+    try:
+        yield from _index_new_with_adhoc_config(path=p)
+    except Exception as e:
+        logger.exception(e)
+        warnings.warn("Setting my.config.browser.export didn't work. You probably need to update HPI.")
+    else:
+        return
+
+    logger.warning("Falling back onto legacy promnesia.sources.browser_old")
+    yield from _index_old(path=p)
+
+
+def _index_old(*, path: PathIsh) -> Results:
+    """Yield visits produced by the legacy promnesia.sources.browser_old indexer for path."""
+    from . import browser_old
+    yield from browser_old.index(path)
+
+
+def _index_new_with_adhoc_config(*, path: PathIsh) -> Results:
+    """Yield visits from my.browser.export under a temporary HPI config whose export_path is the sqlite dbs found under path."""
+    ## previously, it was possible to index be called with multiple different db search paths
+    ## this would result in each subsequent call to my.browser.export.history to invalidate cache every time
+    ## so we hack cachew path so it's different for each call
+    from my.core.core_config import config as hpi_core_config
+    hpi_cache_dir = hpi_core_config.get_cache_dir()
+    sanitized_path = re.sub(r'\W', '_', str(path))
+    cache_override = hpi_cache_dir / sanitized_path
+    ##
+
+    from my.core.common import classproperty, Paths, get_files
+    class config:
+        class core:
+            cache_dir = cache_override
+
+        class browser:
+            class export:
+                @classproperty
+                def export_path(cls) -> Paths:
+                    return tuple([f for f in get_files(path, glob='**/*') if is_sqlite_db(f)])
+
+
+    from my.core.cfg import tmp_config
+    with tmp_config(modules='my.browser.export|my.core.core_config', config=config):
+        from my.browser.export import history
+        yield from _index_new(history())
+
+
+if TYPE_CHECKING:
+    from browserexport.merge import Visit as BrowserMergeVisit
+else:
+    BrowserMergeVisit = Any
+
+
+def _index_new(history: Iterator[BrowserMergeVisit]) -> Results:
+    """Convert each item of history into a promnesia Visit, using metadata title/duration when metadata is present."""
+    for v in history:
+        desc: Optional[str] = None
+        duration: Optional[Second] = None
+        metadata = v.metadata
+        if metadata is not None:
+            desc = metadata.title
+            duration = metadata.duration
+        yield Visit(
+            url=v.url,
+            dt=v.dt,
+            locator=Loc(title=desc or v.url, href=v.url),
+            duration=duration,
+        )
diff --git a/src/promnesia/sources/browser_new.py b/src/promnesia/sources/browser_new.py
deleted file mode 100644
index 76fd6dd7..00000000
--- a/src/promnesia/sources/browser_new.py
+++ /dev/null
@@ -1,22 +0,0 @@
-from typing import Optional
-
-from promnesia.common import Results, Visit, Loc, Second
-
-
-def index() -> Results:
-    from . import hpi
-    from my.browser.all import history
-
-    for v in history():
-        desc: Optional[str] = None
-        duration: Optional[Second] = None
-        metadata = v.metadata
-        if metadata is not None:
-            desc = metadata.title
-            duration = metadata.duration
-        yield Visit(
-            url=v.url,
-            dt=v.dt,
-            locator=Loc(title=desc or v.url, href=v.url),
-            duration=duration,
-        )
diff --git a/src/promnesia/sources/browser_old.py b/src/promnesia/sources/browser_old.py
index 82454d90..2886aa7c 100644
--- a/src/promnesia/sources/browser_old.py
+++ b/src/promnesia/sources/browser_old.py
@@ -6,31 +6,21 @@
 
 import pytz
 
-from ..common import PathIsh, Results, Visit, Loc, get_logger, Second, mime
+from ..common import PathIsh, Results, Visit, Loc, logger, Second, is_sqlite_db
 from .. import config
 
 # todo mcachew?
 from cachew import cachew
 
-logger = get_logger()
-
 
 def index(p: PathIsh) -> Results:
     pp = Path(p)
     assert pp.exists(), pp # just in case of broken symlinks
 
-    # is_file check because it also returns dirs
-    # TODO hmm, not sure what I meant here -- which dirs? behind symlinks?
-    is_db = lambda x: x.is_file() and mime(x) in {
-        'application/x-sqlite3',
-        'application/vnd.sqlite3',
-        # TODO this mime can also match wal files/journals, not sure
-    }
-
     # todo warn if filtered out too many?
     # todo wonder how quickly mimes can be computed?
     # todo ugh, dunno, maybe this really belongs to hpi?? need get_files etc...
-    dbs = [p for p in sorted(pp.rglob('*')) if is_db(p)]
+    dbs = [p for p in sorted(pp.rglob('*')) if is_sqlite_db(p)]
 
     assert len(dbs) > 0, pp
     logger.info('processing %d databases', len(dbs))
diff --git a/tox.ini b/tox.ini
index ca358dfc..278bb11d 100644
--- a/tox.ini
+++ b/tox.ini
@@ -72,6 +72,7 @@ commands =
     hpi module install my.reddit
     hpi module install my.fbmessenger
     hpi module install my.google.takeout.parser
+    hpi module install my.browser.export
 
     {envpython} -m mypy --install-types --non-interactive \
                         -p promnesia.sources \