-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgrab-search.py
123 lines (93 loc) · 5.96 KB
/
grab-search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/bin/python3
import requests
import mimetypes
import sys
from pathlib import Path
from datetime import datetime
import time
import socket
from util import *
from headers import *
# SYNOPSIS:
# To download 100 (or fewer, if there aren't enough) artworks for the search term "game of thrones", call
# python3 grab-search.py "game of thrones" 100
# If max-projects isn't specified, it will fetch them all (beware! I really mean ALL! At the time of writing, this would be over 12000 projects for our "game of thrones" example).
# 2 minute timeout in case something gets stuck.
# The socket default covers code that opens sockets without its own timeout.
# requests documents that it never times out unless `timeout=` is passed, so
# every requests.get() below also passes this value explicitly.
TIMEOUT_SECONDS = 120
socket.setdefaulttimeout(TIMEOUT_SECONDS)

SEARCH_API_URL = "https://www.artstation.com/api/v2/search/projects.json"

# Page size requested from the search API. A page holding fewer entries than
# this is treated as the last page.
PROJECTS_PER_PAGE = 50


def _parse_args(argv):
    """Return ``(search_terms, max_projects)`` parsed from the command line.

    ``search_terms`` is lower-cased. ``max_projects`` is ``sys.maxsize`` when
    no limit was given. Exits with a usage message (status 1) when the search
    term is missing or the limit is not an integer.
    """
    if len(argv) < 2:
        sys.exit(f'Usage: {argv[0]} "<search terms>" [max-projects]')

    search_terms = argv[1].lower()

    # Is max-projects specified?
    if len(argv) < 3:
        return search_terms, sys.maxsize

    try:
        return search_terms, int(argv[2])
    except ValueError:
        sys.exit(f"max-projects must be an integer, got {argv[2]!r}")


def _fetch_json(url, params=None):
    """GET *url* with the project headers and return the decoded JSON body.

    Raises ``requests.exceptions.Timeout`` when the server stalls for
    ``TIMEOUT_SECONDS`` and ``requests.exceptions.HTTPError`` on a 4xx/5xx
    answer, rather than trying to decode an error page as JSON.
    """
    response = requests.get(
        url,
        params=params,
        headers=project_fetch_headers,
        timeout=TIMEOUT_SECONDS,
    )
    response.raise_for_status()
    return response.json()


def _process_project(project, download_directory, log_name):
    """Download every image asset of one search hit, unless already saved.

    *project* is one entry of the search result's ``data`` list.
    *download_directory* is the path prefix the files are written under and
    *log_name* is the name passed to the logging / bookkeeping helpers.
    """
    project_name = project["title"]
    project_hash_id = project["hash_id"]
    project_artist_name = project["user"]["username"]
    project_artist_name_fullname = project["user"]["full_name"]

    logMsg(f"Found project '{project_name}' of artist '{project_artist_name_fullname}' (user-id=[{project_artist_name}]) with project id {project_hash_id}. Fetching more info about it...", "okndl", log_name)

    # Have we already downloaded this post?
    if isPostAlreadySaved(project_hash_id, log_name):
        logMsg(f"Skipping project '{project_name}' [{project_hash_id}] of artist '{project_artist_name_fullname}' (user-id=[{project_artist_name}]) because it is already downloaded.", "okndl", log_name)
        return

    # Fetch information about the project
    project_info = _fetch_json(f"https://www.artstation.com/projects/{project_hash_id}.json")

    # For each asset in the project (might be multiple images)
    for asset in project_info["assets"]:
        # Read the position before branching: both log messages use it, and
        # reading it only on the image path left it undefined (or stale) for
        # non-image assets.
        asset_position = asset.get("position")

        if asset["asset_type"] != "image":
            logMsg(f"Found non-image-asset for project '{project_name}' [{project_hash_id}] of artist '{project_artist_name_fullname}' (user-id=[{project_artist_name}]) at position {asset_position}. Skipping...", "okdl", log_name)
            continue

        asset_image_url = asset["image_url"]

        # Generate a download filename
        filename = download_directory + slugify(project_artist_name) + "_" + slugify(project_name[:60] + "_" + project_hash_id + "_" + str(asset_position)) + "." + extensionFromUrl(asset_image_url)

        logMsg(f"Found image-asset for project '{project_name}' [{project_hash_id}] of artist '{project_artist_name_fullname}' (user-id=[{project_artist_name}]) at position {asset_position}. Downloading to '{filename}'...", "okdl", log_name)

        # Download it
        downloadMedia(asset_image_url, filename)

    # After downloading all assets, mark the project as downloaded.
    markPostAsSaved(project_hash_id, log_name)


def main():
    """Page through the search results for the given terms and download them.

    Stops when the last result page has been processed or when the optional
    project limit from the command line is reached. Timeouts and other errors
    are logged and end the run; Ctrl-C and explicit exits are not swallowed.
    """
    search_terms, max_projects = _parse_args(sys.argv)
    search_terms_filename = "search_" + slugify(search_terms)

    # Create the download directory for this search if it doesn't exist
    download_directory = "./downloads/" + search_terms_filename + "/"
    Path(download_directory).mkdir(parents=True, exist_ok=True)

    # Create directory for already saved posts
    Path("./already_saved/").mkdir(parents=True, exist_ok=True)

    # Create directory for logging
    Path("./logs/").mkdir(parents=True, exist_ok=True)

    limit_text = "ALL OF THEM!" if max_projects == sys.maxsize else str(max_projects)
    logMsg(f"Fetching search results for '{search_terms}'... Max projects to fetch: {limit_text}", "okndl", search_terms_filename)

    last_page_reached = False
    page_counter = 1
    project_counter = 0

    try:
        while not last_page_reached:
            logMsg(f"Fetching search result page #{page_counter} for '{search_terms}'...", "okndl", search_terms_filename)

            # Let requests build the query string so that characters such as
            # '&', '#' or '%' in the search terms are escaped (spaces still
            # become '+').
            page = _fetch_json(
                SEARCH_API_URL,
                params={
                    "page": page_counter,
                    "per_page": PROJECTS_PER_PAGE,
                    "sorting": "relevance",
                    "query": search_terms,
                },
            )
            projects = page["data"]
            result_size = page["total_count"]
            page_num_projects = len(projects)

            # Each full page contains PROJECTS_PER_PAGE projects. If it has
            # fewer, it is the last page.
            last_page_reached = page_num_projects < PROJECTS_PER_PAGE

            if not last_page_reached:
                page_counter += 1
                logMsg(f"Found {result_size} projects total (all pages). Of this {page_num_projects} on this page...", "okndl", search_terms_filename)
            else:
                logMsg(f"Found {result_size} projects total (all pages). Of this {page_num_projects} on this page... This is the last page!", "okndl", search_terms_filename)

            # For each project on this result page
            for project in projects:
                if project_counter >= max_projects:
                    logMsg(f"Reached project download limit of {max_projects}. Stopping...", "okndl", search_terms_filename)
                    return

                _process_project(project, download_directory, search_terms_filename)
                # Counts every project examined, including already-saved ones.
                project_counter += 1

        logMsg(f"Finished all search result pages of '{search_terms}'... Total pages scanned: {page_counter}", "okndl", search_terms_filename)

    except (socket.timeout, requests.exceptions.Timeout):
        # socket.timeout: raised by code relying on the socket default.
        # requests.exceptions.Timeout: raised by the requests.get() calls.
        logMsg("Socket timeout of two minutes reached! We'll get 'em next time, boys!", "err", search_terms_filename)

    except Exception as exc:
        # Deliberately not BaseException: Ctrl-C and sys.exit() must get through.
        logMsg("Failed for some reason!: " + repr(exc), "err", search_terms_filename)


if __name__ == "__main__":
    main()