Skip to content

Commit

Permalink
Merge pull request #7 from RyouMon/dev
Browse files Browse the repository at this point in the history
Pixiv illust filenames are now based on the illust id.
  • Loading branch information
RyouMon authored Feb 25, 2022
2 parents 8b4495f + 52a1ea1 commit 748b794
Show file tree
Hide file tree
Showing 9 changed files with 18 additions and 68 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ export https_proxy=http://localhost:8080
# Login

```
favors login [-h] {pixiv,yandere,lemon}
favors login [-h] {pixiv,yandere}
```

## Login Pixiv
Expand Down
2 changes: 2 additions & 0 deletions dev_main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Development entry point: run the favorites-crawler CLI directly
# (e.g. `python dev_main.py`) without installing the console script.
from favorites_crawler import main

# Guard the call so importing this module (e.g. from tests or tooling)
# does not immediately launch the crawler.
if __name__ == '__main__':
    main()
5 changes: 0 additions & 5 deletions src/favorites_crawler/constants/blacklists.py

This file was deleted.

3 changes: 0 additions & 3 deletions src/favorites_crawler/constants/regexes.py
Original file line number Diff line number Diff line change
@@ -1,4 +1 @@
# match file id, file ext
# Captures (file_id, file_ext) from a Pixiv original-image URL, e.g.
# https://i.pximg.net/img-original/img/.../87654321_p0.png -> ('87654321_p0', 'png')
PIXIV_ORIGINAL_IMAGE_URL_PATTERN = r'https://i.pximg.net/img-original/img/.+/(.+)\.(.+)'

# Character class of characters considered unsafe in filenames; matched
# occurrences are stripped before saving files to disk.
# NOTE(review): '?' appears twice inside the class (harmless duplicate),
# and '\<' is an unnecessary escape — behavior is unaffected.
ILLEGAL_FILENAME_CHARACTERS = r'[#%&{}\<>*?/$!\'":@+`|=?]'
2 changes: 1 addition & 1 deletion src/favorites_crawler/itemloaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ class PixivIllustItemLoader(ItemLoader):
default_item_class = items.PixivIllustItem
default_output_processor = take_first

original_image_urls_out = identity
image_urls_out = identity
tags_out = filter_pixiv_tags


Expand Down
27 changes: 11 additions & 16 deletions src/favorites_crawler/items.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,22 +40,17 @@ def get_folder_subfix(self):


@dataclass
class PixivIllustItem:
    """Holds the metadata of one Pixiv illustration scraped by the spider."""
    id: int = None                              # Pixiv illust id
    title: str = None                           # illust title
    tags: list = field(default_factory=list)    # list of tag dicts from the API
    referer: str = None                         # Referer header for image downloads
    original_image_urls: list = None            # URLs of the original images

    def get_filename(self, pk, ext):
        """Build '<pk> <title> [<tags>].<ext>' with illegal characters removed.

        *pk* is the file id and *ext* the file extension.
        """
        safe_tags = []
        for tag in self.tags:
            # Prefer the translated tag name; fall back to the original name.
            label = tag.get('translated_name') or tag.get('name', '')
            safe_tags.append(drop_illegal_characters(label).replace(' ', '_'))
        safe_title = drop_illegal_characters(self.title)
        return f'{pk} {safe_title} [{" ".join(safe_tags)}].{ext}'
class PixivIllustItem(BaseItem):
    """Pixiv illustration item.

    Folder-naming hooks return empty strings, so every illustration is
    stored directly under the single 'Pixiv' folder.
    """

    def get_folder_prefix(self):
        # No per-item folder prefix for Pixiv downloads.
        return ''

    def get_folder_subfix(self):
        # NOTE(review): 'subfix' (sic) presumably mirrors BaseItem's hook
        # name — likely meant 'suffix'; confirm against BaseItem.
        return ''

    def get_folder_name(self):
        # Fixed destination folder for all Pixiv illustrations.
        return 'Pixiv'



@dataclass
Expand Down
27 changes: 0 additions & 27 deletions src/favorites_crawler/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,39 +3,12 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

import re
import os

from scrapy import Request
from scrapy.exceptions import DropItem
from scrapy.pipelines.files import FilesPipeline
from itemadapter import ItemAdapter

from favorites_crawler.constants.regexes import PIXIV_ORIGINAL_IMAGE_URL_PATTERN


class PixivFilesPipeline(FilesPipeline):
    """Downloads Pixiv original images, naming files via the item itself."""

    def get_media_requests(self, item, info):
        """Build one download request per original image URL.

        Each request carries the item's Referer header (required by Pixiv's
        CDN) and the full item dict as request meta.
        """
        data = ItemAdapter(item).asdict()
        referer = data['referer']
        requests = []
        for url in data['original_image_urls']:
            requests.append(Request(url, headers={'Referer': referer}, meta=data))
        return requests

    def item_completed(self, results, item, info):
        """Drop the item when not a single file downloaded successfully."""
        downloaded = [result['path'] for ok, result in results if ok]
        if not downloaded:
            raise DropItem('Image Download Failed')
        return item

    def file_path(self, request, response=None, info=None, *, item=None):
        """Store each file as Pixiv/<filename derived from the item>."""
        # The pattern captures (file_id, file_ext) from the original URL.
        file_id, file_ext = re.search(
            PIXIV_ORIGINAL_IMAGE_URL_PATTERN, request.url).groups()
        return os.path.join('Pixiv', item.get_filename(file_id, file_ext))


class YandreFilesPipeline(FilesPipeline):

Expand Down
12 changes: 0 additions & 12 deletions src/favorites_crawler/processors.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,11 @@

from itemloaders.processors import TakeFirst, Identity

from favorites_crawler.constants.blacklists import PIXIV_TAG_KEYWORD_BLACKLIST


# Shared processor instances reused by the item loaders.
take_first = TakeFirst()  # output processor: keep the first non-null value
identity = Identity()     # output processor: pass collected values through unchanged


def filter_pixiv_tags(tags):
    """Drop tags that contain any blacklisted keyword.

    :param tags: iterable of Pixiv tag dicts (values such as 'name' and
        'translated_name'; values may be None).
    :return: list of the tags whose values contain no keyword from
        PIXIV_TAG_KEYWORD_BLACKLIST.
    """
    def _is_allowed(tag):
        # One blacklisted keyword in any value rejects the whole tag.
        return not any(
            value and keyword in value
            for value in tag.values()
            for keyword in PIXIV_TAG_KEYWORD_BLACKLIST
        )
    # Original used the redundant `[tag for tag in filter(...)]` form;
    # a plain filtering comprehension is the idiomatic equivalent.
    return [tag for tag in tags if _is_allowed(tag)]


def get_nhentai_id(url):
"""Get comic ID from comic url"""
match = re.match(r'https.+g/(\d+)/', url)
Expand Down
6 changes: 3 additions & 3 deletions src/favorites_crawler/spiders/pixiv.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ class PixivSpider(Spider):
custom_settings = {
'USER_AGENT': PIXIV_IOS_USER_AGENT,
'DEFAULT_REQUEST_HEADERS': PIXIV_REQUEST_HEADERS,
'ITEM_PIPELINES': {'favorites_crawler.pipelines.PixivFilesPipeline': 0},
'ITEM_PIPELINES': {'favorites_crawler.pipelines.CollectionFilePipeline': 0},
# Add PixivAuthorizationMiddleware after DefaultHeadersMiddleware
'DOWNLOADER_MIDDLEWARES': {'favorites_crawler.middlewares.PixivAuthorizationMiddleware': 450},
}
Expand Down Expand Up @@ -48,8 +48,8 @@ def parse(self, response, **kwargs):
loader.add_value('id', illust.get('id'))
loader.add_value('title', illust.get('title'))
loader.add_value('tags', illust.get('tags'))
loader.add_value('original_image_urls', illust.get('meta_single_page', {}).get('original_image_url', ()))
loader.add_value('original_image_urls', [
loader.add_value('image_urls', illust.get('meta_single_page', {}).get('original_image_url', ()))
loader.add_value('image_urls', [
d['image_urls']['original'] for d in illust.get('meta_pages', ())
if d.get('image_urls', {}).get('original')
])
Expand Down

0 comments on commit 748b794

Please sign in to comment.