diff --git a/docs/main/_images/RReLU.png b/docs/main/_images/RReLU.png
index 4ef2552b9950..a0a467f023dc 100644
Binary files a/docs/main/_images/RReLU.png and b/docs/main/_images/RReLU.png differ
diff --git a/docs/main/_modules/torch.html b/docs/main/_modules/torch.html
index 21000b19299f..6770913807cb 100644
--- a/docs/main/_modules/torch.html
+++ b/docs/main/_modules/torch.html
@@ -1982,6 +1982,12 @@
return compile_fx(model_, inputs_, config_patches=self.config)
+    def reset(self):
+        from torch._inductor import config
+        if "triton.cudagraphs" in self.config or config.triton.cudagraphs:
+            if self.config.get("triton.cudagraphs", True):
+                from torch._inductor.cudagraph_trees import reset_cudagraph_trees
+                reset_cudagraph_trees()
[docs]def compile(model: Optional[Callable] = None, *,
fullgraph: builtins.bool = False,
diff --git a/docs/main/_modules/torch/_dynamo.html b/docs/main/_modules/torch/_dynamo.html
index df8f7254129e..de29d1345f79 100644
--- a/docs/main/_modules/torch/_dynamo.html
+++ b/docs/main/_modules/torch/_dynamo.html
@@ -506,6 +506,8 @@ Source code for torch._dynamo
orig_code_map.clear()
guard_failures.clear()
resume_execution.ContinueExecutionCache.cache.clear()
+ if hasattr(eval_frame.most_recent_backend, "reset"):
+ eval_frame.most_recent_backend.reset()
eval_frame.most_recent_backend = None
compilation_metrics.clear()
reset_frame_count()
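A minimal usage sketch (not part of the change) of where the new reset path matters, assuming a CUDA build; the function `f` below is only an illustration:

    import torch

    def f(x):
        return x.sin() + 1

    # "reduce-overhead" enables triton.cudagraphs in the inductor config, which
    # is the case the new reset() method above cares about.
    fn = torch.compile(f, mode="reduce-overhead")
    if torch.cuda.is_available():
        fn(torch.randn(8, device="cuda"))

    # torch._dynamo.reset() tears down Dynamo caches; per the hunk above it now
    # also calls reset() on the most recent backend when that backend defines
    # one, which for inductor clears the CUDA graph trees.
    torch._dynamo.reset()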
diff --git a/docs/main/_modules/torch/_dynamo/eval_frame.html b/docs/main/_modules/torch/_dynamo/eval_frame.html
index a91c295fa105..17e0fb61d93a 100644
--- a/docs/main/_modules/torch/_dynamo/eval_frame.html
+++ b/docs/main/_modules/torch/_dynamo/eval_frame.html
@@ -510,7 +510,6 @@ Source code for torch._dynamo.eval_frame
log = logging.getLogger(__name__)
from torch._dispatch.python import enable_python_dispatcher
-from torch._subclasses.fake_tensor import FakeTensor
from torch.fx.experimental import proxy_tensor
always_optimize_code_objects = utils.ExactWeakKeyDictionary()
@@ -901,7 +900,7 @@ Source code for torch._dynamo.eval_frame
elif sys.version_info >= (3, 11):
warnings.warn(
"torch.compile support of Python 3.11 is experimental. "
- "Program may generate incorrect results or segfault."
+ "Program may segfault."
)
@@ -1154,7 +1153,6 @@ Source code for torch._dynamo.eval_frame
graph = None
out_guards = None
graph_captured_input = None
- example_fake_inputs = []
graph_captured_result: Optional[Tuple[torch.Tensor, ...]] = None
def produce_matching(source_args, candidate_args):
@@ -1191,8 +1189,11 @@ Source code for torch._dynamo.eval_frame
assert out_guards is None, "whole graph export entails exactly one guard export"
out_guards = guards
+ fake_mode = None
+ example_inputs = []
+
def dynamo_normalization_capturing_compiler(
- gm: torch.fx.GraphModule, example_inputs
+ gm: torch.fx.GraphModule, inner_example_inputs
):
nonlocal graph
assert (
@@ -1200,8 +1201,9 @@ Source code for torch._dynamo.eval_frame
), "Tried to emit a second graph during export. Tracing through 'f' must produce a single graph."
graph = gm
- nonlocal example_fake_inputs
- example_fake_inputs = example_inputs
+ nonlocal fake_mode, example_inputs
+ fake_mode = _guards.detect_fake_mode(inner_example_inputs)
+ example_inputs = inner_example_inputs
def result_capturing_wrapper(*graph_inputs):
nonlocal graph_captured_result
@@ -1238,6 +1240,7 @@ Source code for torch._dynamo.eval_frame
graph is not None
), "Failed to produce a graph during tracing. Tracing through 'f' must produce a single graph."
assert out_guards is not None, "Failed to produce guards during tracing"
+ assert fake_mode is not None
matched_input_elements_positions = produce_matching(flat_args, graph_captured_input)
@@ -1283,19 +1286,16 @@ Source code for torch._dynamo.eval_frame
r.node.meta["val"] = self.current_node.meta["val"]
return r
+ # NB: This is mostly hitting the cache; Dynamo already converted these
+ example_fake_inputs = [fake_mode.from_tensor(t) for t in example_inputs]
+
if aten_graph:
# Running graph with interpreter is needed for propagating the stack_trace
def graph_with_interpreter(*args):
with torch.fx.traceback.preserve_node_meta():
return torch.fx.Interpreter(graph).run(*args)
- fake_tensor_mode = null_context()
- for val in example_fake_inputs:
- if isinstance(val, FakeTensor):
- fake_tensor_mode = val.fake_mode
- break
-
- with enable_python_dispatcher(), fake_tensor_mode:
+ with enable_python_dispatcher(), fake_mode:
graph = make_fx(
graph_with_interpreter,
decomposition_table=decomposition_table,
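The rewritten export path leans on `fake_mode.from_tensor` mostly hitting the converter's cache, as the NB comment says. A small sketch of that caching behavior, assuming the converter memoizes per real tensor:

    import torch
    from torch._subclasses.fake_tensor import FakeTensorMode

    fake_mode = FakeTensorMode()
    t = torch.randn(3, 4)

    # Converting a tensor that was already faked under this mode is expected to
    # return the memoized fake tensor rather than allocate a second one.
    f1 = fake_mode.from_tensor(t)
    f2 = fake_mode.from_tensor(t)
    print(f1 is f2, f1.shape)   # expected: True torch.Size([3, 4])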
diff --git a/docs/main/_modules/torch/_tensor_str.html b/docs/main/_modules/torch/_tensor_str.html
index b7c93f8c43b0..63ac4fcb92f3 100644
--- a/docs/main/_modules/torch/_tensor_str.html
+++ b/docs/main/_modules/torch/_tensor_str.html
@@ -991,12 +991,15 @@ Source code for torch._tensor_str
prefix = "_to_functional_tensor("
tensor_str = repr(torch._from_functional_tensor(self))
else:
- if self.is_meta:
+ # Circular import problem, so we import it here
+ from torch._subclasses.fake_tensor import FakeTensor
+
+ if self.is_meta or isinstance(self, FakeTensor):
suffixes.append("size=" + str(tuple(self.shape)))
if self.dtype != torch.get_default_dtype():
suffixes.append("dtype=" + str(self.dtype))
# TODO: This implies that ellipses is valid syntax for allocating
- # a meta tensor, which it could be, but it isn't right now
+ # a meta tensor or FakeTensor, which it could be, but it isn't right now
if not custom_contents_provided:
tensor_str = "..."
else:
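A short sketch of the repr behavior this change extends from meta tensors to FakeTensors; the exact FakeTensor output string is not asserted here:

    import torch
    from torch._subclasses.fake_tensor import FakeTensorMode

    # Meta tensors already print with elided data and an explicit size suffix:
    print(torch.empty(2, 3, device="meta"))
    # tensor(..., device='meta', size=(2, 3))

    # With the change above, a FakeTensor repr is expected to take the same
    # "..." form instead of trying to materialize its (nonexistent) data.
    with FakeTensorMode():
        fake = torch.empty(2, 3)
    print(fake)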
diff --git a/docs/main/_modules/torch/cuda.html b/docs/main/_modules/torch/cuda.html
index ed9291b763a7..746ea6e90f87 100644
--- a/docs/main/_modules/torch/cuda.html
+++ b/docs/main/_modules/torch/cuda.html
@@ -1352,6 +1352,64 @@ Source code for torch.cuda
+def _get_device(device: Union[int, str, torch.device]) -> torch.device:
+    r"""Return the torch.device type object from the passed in device.
+
+    Args:
+        device (torch.device or int): selected device.
+    """
+    if isinstance(device, str):
+        device = torch.device(device)
+    elif isinstance(device, int):
+        device = torch.device('cuda', device)
+    return device
+
+
+def _get_generator(device: torch.device) -> torch._C.Generator:
+    r"""Return the CUDA Generator object for the given device.
+
+    Args:
+        device (torch.device): selected device.
+    """
+
+    idx = device.index
+    if idx is None:
+        idx = current_device()
+    return torch.cuda.default_generators[idx]
+
+
+def _set_rng_state_offset(offset: int, device: Union[int, str, torch.device] = 'cuda') -> None:
+    r"""Sets the random number generator state offset of the specified GPU.
+
+    Args:
+        offset (int): The desired offset
+        device (torch.device or int, optional): The device to set the RNG state.
+            Default: ``'cuda'`` (i.e., ``torch.device('cuda')``, the current CUDA device).
+    """
+    final_device = _get_device(device)
+
+    def cb():
+        default_generator = _get_generator(final_device)
+        default_generator.set_offset(offset)
+
+    _lazy_call(cb)
+
+def _get_rng_state_offset(device: Union[int, str, torch.device] = 'cuda') -> int:
+    r"""Returns the random number generator state offset of the specified GPU.
+
+    Args:
+        device (torch.device or int, optional): The device to return the RNG state offset of.
+            Default: ``'cuda'`` (i.e., ``torch.device('cuda')``, the current CUDA device).
+
+    .. warning::
+        This function eagerly initializes CUDA.
+    """
+    _lazy_init()
+    final_device = _get_device(device)
+    default_generator = _get_generator(final_device)
+    return default_generator.get_offset()
+
+
from .memory import * # noqa: F403
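A hedged usage sketch of the new helpers; they are underscore-prefixed (private, subject to change) and assumed here to be reachable as `torch.cuda._get_rng_state_offset` / `torch.cuda._set_rng_state_offset`, matching the module page above:

    import torch

    if torch.cuda.is_available():
        torch.cuda.manual_seed(0)
        offset = torch.cuda._get_rng_state_offset()   # eagerly initializes CUDA
        torch.cuda._set_rng_state_offset(offset + 4)  # queued via _lazy_call until CUDA is ready
        print(torch.cuda._get_rng_state_offset())     # expected: offset + 4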
diff --git a/docs/main/_modules/torch/utils/data/dataloader.html b/docs/main/_modules/torch/utils/data/dataloader.html
index 99b56cc73162..ff7b44bd57a1 100644
--- a/docs/main/_modules/torch/utils/data/dataloader.html
+++ b/docs/main/_modules/torch/utils/data/dataloader.html
@@ -618,18 +618,21 @@ Source code for torch.utils.data.dataloader
worker_init_fn (Callable, optional): If not ``None``, this will be called on each
worker subprocess with the worker id (an int in ``[0, num_workers - 1]``) as
input, after seeding and before data loading. (default: ``None``)
+ multiprocessing_context (str or multiprocessing.context.BaseContext, optional): If
+ ``None``, the default `multiprocessing context`_ of your operating system will
+ be used. (default: ``None``)
generator (torch.Generator, optional): If not ``None``, this RNG will be used
by RandomSampler to generate random indexes and multiprocessing to generate
- `base_seed` for workers. (default: ``None``)
+ ``base_seed`` for workers. (default: ``None``)
prefetch_factor (int, optional, keyword-only arg): Number of batches loaded
in advance by each worker. ``2`` means there will be a total of
2 * num_workers batches prefetched across all workers. (default value depends
on the set value for num_workers. If value of num_workers=0 default is ``None``.
- Otherwise if value of num_workers>0 default is ``2``).
- persistent_workers (bool, optional): If ``True``, the data loader will not shutdown
+ Otherwise, if value of ``num_workers > 0`` default is ``2``).
+ persistent_workers (bool, optional): If ``True``, the data loader will not shut down
the worker processes after a dataset has been consumed once. This allows to
maintain the workers `Dataset` instances alive. (default: ``False``)
- pin_memory_device (str, optional): the device to pin memory to if ``pin_memory`` is
+ pin_memory_device (str, optional): the device to :attr:`pin_memory` to if ``pin_memory`` is
``True``.
@@ -658,6 +661,9 @@ Source code for torch.utils.data.dataloader
.. warning:: See :ref:`reproducibility`, and :ref:`dataloader-workers-random-seed`, and
:ref:`data-loading-randomness` notes for random seed related questions.
+
+ .. _multiprocessing context:
+ https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods
"""
dataset: Dataset[T_co]
batch_size: Optional[int]
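A small sketch of the newly documented ``multiprocessing_context`` argument; the dataset and sizes are arbitrary:

    import torch
    from torch.utils.data import DataLoader, TensorDataset

    def main():
        dataset = TensorDataset(torch.arange(16, dtype=torch.float32))
        # multiprocessing_context accepts a start-method name ("fork", "spawn",
        # "forkserver") or a multiprocessing.context.BaseContext object; None
        # falls back to the operating system default described above.
        loader = DataLoader(
            dataset,
            batch_size=4,
            num_workers=2,
            multiprocessing_context="spawn",
            persistent_workers=True,   # workers are kept alive between epochs
        )
        for (batch,) in loader:
            pass

    if __name__ == "__main__":
        main()  # the __main__ guard matters when using the "spawn" start method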
diff --git a/docs/main/_sources/compile/nn-module.rst.txt b/docs/main/_sources/compile/nn-module.rst.txt
index cad950e387cb..21a8e624a247 100644
--- a/docs/main/_sources/compile/nn-module.rst.txt
+++ b/docs/main/_sources/compile/nn-module.rst.txt
@@ -28,6 +28,13 @@ By default, `torch.compile` will trace the contents of `nn.Module.__call__` whic
and run forward/pre-forward hooks. If you install hooks before calling `torch.compile` and then do not remove
or alter the hooks later, your use case should be supported by default.
+Backward/pre-backward hooks are generally also supported, with similar caveats: Dynamo currently graph-breaks
+when accessing the backward_hooks dicts, which could probably be avoided with some work. Graph breaks also
+affect the timing at which backward hooks fire, since graph segments are run as autograd functions that produce
+all of their grads at the same time. Even if Dynamo did not graph-break on the presence of backward hooks, we
+would still expect the backward hooks for a series of modules to fire together only after the whole compiled
+graph's backward has run.
+
**hooks on 'allowed modules'**
`torch.compile` treats common modules such as torch.conv, as well as modules that are difficult to trace, specially
by allowing them to be called opaquely in the dynamo graph instead of traced into by dynamo. For such modules, hooks
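A minimal sketch of the supported pattern described above (hooks installed before `torch.compile`), including the backward-hook timing caveat; the model and hooks are illustrative only:

    import torch
    import torch.nn as nn

    model = nn.Sequential(nn.Linear(4, 4), nn.ReLU(), nn.Linear(4, 1))

    def fwd_hook(module, inputs, output):
        return output * 2   # forward hooks installed before compiling are traced

    def bwd_hook(module, grad_input, grad_output):
        print("backward hook fired for", type(module).__name__)

    model[0].register_forward_hook(fwd_hook)
    model[0].register_full_backward_hook(bwd_hook)

    compiled = torch.compile(model)
    loss = compiled(torch.randn(2, 4)).sum()
    # Per the note above, the backward hook may fire later than in eager mode,
    # only after the compiled graph segment's backward has produced all grads.
    loss.backward()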
diff --git a/docs/main/compile/nn-module.html b/docs/main/compile/nn-module.html
index 1ca276e63247..d0523d720bb7 100644
--- a/docs/main/compile/nn-module.html
+++ b/docs/main/compile/nn-module.html
@@ -478,6 +478,12 @@ nn.Module.__call__ Hooks Usage and limitations
None, this will be called on each
worker subprocess with the worker id (an int in [0, num_workers - 1]) as
input, after seeding and before data loading. (default: None)
+multiprocessing_context (str or multiprocessing.context.BaseContext, optional) – If
+None, the default multiprocessing context of your operating system will
+be used. (default: None)
generator (torch.Generator, optional) – If not None, this RNG will be used
by RandomSampler to generate random indexes and multiprocessing to generate
-base_seed for workers. (default: None)
+base_seed for workers. (default: None)
prefetch_factor (int, optional, keyword-only arg) – Number of batches loaded
in advance by each worker. 2 means there will be a total of
2 * num_workers batches prefetched across all workers. (default value depends
on the set value for num_workers. If value of num_workers=0 default is None.
-Otherwise if value of num_workers>0 default is 2).
-persistent_workers (bool, optional) – If True, the data loader will not shutdown
+Otherwise, if value of num_workers > 0 default is 2).
+persistent_workers (bool, optional) – If True, the data loader will not shut down
the worker processes after a dataset has been consumed once. This allows to
maintain the workers Dataset instances alive. (default: False)
-pin_memory_device (str, optional) – the device to pin memory to if pin_memory is
+pin_memory_device (str, optional) – the device to pin_memory to if pin_memory is
True.
diff --git a/docs/main/quantization-backend-configuration.html b/docs/main/quantization-backend-configuration.html
index c2a17faeca06..ad73b75ffc24 100644
--- a/docs/main/quantization-backend-configuration.html
+++ b/docs/main/quantization-backend-configuration.html
@@ -479,7 +479,7 @@ Default values for native configurations
Operator Tags
class torch.Tag¶
Members:
-core
-generated
nondeterministic_bitwise
-pointwise
dynamic_output_shape
-view_copy
nondeterministic_seeded
-data_dependent_output
+view_copy
inplace_view
+core
+generated
+pointwise
+data_dependent_output
-
property name¶
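For reference, a small sketch of how the `torch.Tag` members listed above are typically consumed; whether `aten.add.Tensor` actually carries a given tag depends on the build:

    import torch

    # Tags are attached to operator overloads, so membership checks do not
    # depend on the order in which the enum members are listed on this page.
    overload = torch.ops.aten.add.Tensor
    print(torch.Tag.pointwise in overload.tags)
    print([t.name for t in overload.tags])   # each member exposes the `name` property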