diff --git a/docs/main/_images/RReLU.png b/docs/main/_images/RReLU.png index 4ef2552b9950..a0a467f023dc 100644 Binary files a/docs/main/_images/RReLU.png and b/docs/main/_images/RReLU.png differ diff --git a/docs/main/_modules/torch.html b/docs/main/_modules/torch.html index 21000b19299f..6770913807cb 100644 --- a/docs/main/_modules/torch.html +++ b/docs/main/_modules/torch.html @@ -1982,6 +1982,12 @@

Source code for torch

 
         return compile_fx(model_, inputs_, config_patches=self.config)
 
+    def reset(self):
+        from torch._inductor import config
+        if "triton.cudagraphs" in self.config or config.triton.cudagraphs:
+            if self.config.get("triton.cudagraphs", True):
+                from torch._inductor.cudagraph_trees import reset_cudagraph_trees
+                reset_cudagraph_trees()
 
 
[docs]def compile(model: Optional[Callable] = None, *, fullgraph: builtins.bool = False, diff --git a/docs/main/_modules/torch/_dynamo.html b/docs/main/_modules/torch/_dynamo.html index df8f7254129e..de29d1345f79 100644 --- a/docs/main/_modules/torch/_dynamo.html +++ b/docs/main/_modules/torch/_dynamo.html @@ -506,6 +506,8 @@

Source code for torch._dynamo

     orig_code_map.clear()
     guard_failures.clear()
     resume_execution.ContinueExecutionCache.cache.clear()
+    if hasattr(eval_frame.most_recent_backend, "reset"):
+        eval_frame.most_recent_backend.reset()
     eval_frame.most_recent_backend = None
     compilation_metrics.clear()
     reset_frame_count()
diff --git a/docs/main/_modules/torch/_dynamo/eval_frame.html b/docs/main/_modules/torch/_dynamo/eval_frame.html index a91c295fa105..17e0fb61d93a 100644 --- a/docs/main/_modules/torch/_dynamo/eval_frame.html +++ b/docs/main/_modules/torch/_dynamo/eval_frame.html @@ -510,7 +510,6 @@

Source code for torch._dynamo.eval_frame

 log = logging.getLogger(__name__)
 
 from torch._dispatch.python import enable_python_dispatcher
-from torch._subclasses.fake_tensor import FakeTensor
 from torch.fx.experimental import proxy_tensor
 
 always_optimize_code_objects = utils.ExactWeakKeyDictionary()
@@ -901,7 +900,7 @@ 

Source code for torch._dynamo.eval_frame

     elif sys.version_info >= (3, 11):
         warnings.warn(
             "torch.compile support of Python 3.11 is experimental. "
-            "Program may generate incorrect results or segfault."
+            "Program may segfault."
         )
 
 
@@ -1154,7 +1153,6 @@ 

Source code for torch._dynamo.eval_frame

     graph = None
     out_guards = None
     graph_captured_input = None
-    example_fake_inputs = []
     graph_captured_result: Optional[Tuple[torch.Tensor, ...]] = None
 
     def produce_matching(source_args, candidate_args):
@@ -1191,8 +1189,11 @@ 

Source code for torch._dynamo.eval_frame

         assert out_guards is None, "whole graph export entails exactly one guard export"
         out_guards = guards
 
+    fake_mode = None
+    example_inputs = []
+
     def dynamo_normalization_capturing_compiler(
-        gm: torch.fx.GraphModule, example_inputs
+        gm: torch.fx.GraphModule, inner_example_inputs
     ):
         nonlocal graph
         assert (
@@ -1200,8 +1201,9 @@ 

Source code for torch._dynamo.eval_frame

         ), "Tried to emit a second graph during export. Tracing through 'f' must produce a single graph."
         graph = gm
 
-        nonlocal example_fake_inputs
-        example_fake_inputs = example_inputs
+        nonlocal fake_mode, example_inputs
+        fake_mode = _guards.detect_fake_mode(inner_example_inputs)
+        example_inputs = inner_example_inputs
 
         def result_capturing_wrapper(*graph_inputs):
             nonlocal graph_captured_result
@@ -1238,6 +1240,7 @@ 

Source code for torch._dynamo.eval_frame

         graph is not None
     ), "Failed to produce a graph during tracing. Tracing through 'f' must produce a single graph."
     assert out_guards is not None, "Failed to produce guards during tracing"
+    assert fake_mode is not None
 
     matched_input_elements_positions = produce_matching(flat_args, graph_captured_input)
 
@@ -1283,19 +1286,16 @@ 

Source code for torch._dynamo.eval_frame

                 r.node.meta["val"] = self.current_node.meta["val"]
             return r
 
+    # NB: This is mostly hitting the cache; Dynamo already converted these
+    example_fake_inputs = [fake_mode.from_tensor(t) for t in example_inputs]
+
     if aten_graph:
         # Running graph with interpreter is needed for propagating the stack_trace
         def graph_with_interpreter(*args):
             with torch.fx.traceback.preserve_node_meta():
                 return torch.fx.Interpreter(graph).run(*args)
 
-        fake_tensor_mode = null_context()
-        for val in example_fake_inputs:
-            if isinstance(val, FakeTensor):
-                fake_tensor_mode = val.fake_mode
-                break
-
-        with enable_python_dispatcher(), fake_tensor_mode:
+        with enable_python_dispatcher(), fake_mode:
             graph = make_fx(
                 graph_with_interpreter,
                 decomposition_table=decomposition_table,
diff --git a/docs/main/_modules/torch/_tensor_str.html b/docs/main/_modules/torch/_tensor_str.html
index b7c93f8c43b0..63ac4fcb92f3 100644
--- a/docs/main/_modules/torch/_tensor_str.html
+++ b/docs/main/_modules/torch/_tensor_str.html
@@ -991,12 +991,15 @@ 

Source code for torch._tensor_str

         prefix = "_to_functional_tensor("
         tensor_str = repr(torch._from_functional_tensor(self))
     else:
-        if self.is_meta:
+        # Circular import problem, so we import it here
+        from torch._subclasses.fake_tensor import FakeTensor
+
+        if self.is_meta or isinstance(self, FakeTensor):
             suffixes.append("size=" + str(tuple(self.shape)))
             if self.dtype != torch.get_default_dtype():
                 suffixes.append("dtype=" + str(self.dtype))
             # TODO: This implies that ellipses is valid syntax for allocating
-            # a meta tensor, which it could be, but it isn't right now
+            # a meta tensor or FakeTensor, which it could be, but it isn't right now
             if not custom_contents_provided:
                 tensor_str = "..."
         else:
diff --git a/docs/main/_modules/torch/cuda.html b/docs/main/_modules/torch/cuda.html
index ed9291b763a7..746ea6e90f87 100644
--- a/docs/main/_modules/torch/cuda.html
+++ b/docs/main/_modules/torch/cuda.html
@@ -1352,6 +1352,64 @@ 

Source code for torch.cuda

 
 
 
+def _get_device(device: Union[int, str, torch.device]) -> torch.device:
+    r"""Return the torch.device type object from the passed in device.
+
+    Args:
+        device (torch.device or int): selected device.
+    """
+    if isinstance(device, str):
+        device = torch.device(device)
+    elif isinstance(device, int):
+        device = torch.device('cuda', device)
+    return device
+
+
+def _get_generator(device: torch.device) -> torch._C.Generator:
+    r"""Return the CUDA Generator object for the given device.
+
+    Args:
+        device (torch.device): selected device.
+    """
+
+    idx = device.index
+    if idx is None:
+        idx = current_device()
+    return torch.cuda.default_generators[idx]
+
+
+def _set_rng_state_offset(offset: int, device: Union[int, str, torch.device] = 'cuda') -> None:
+    r"""Sets the random number generator state offset of the specified GPU.
+
+    Args:
+        offset (int): The desired offset
+        device (torch.device or int, optional): The device to set the RNG state.
+            Default: ``'cuda'`` (i.e., ``torch.device('cuda')``, the current CUDA device).
+    """
+    final_device = _get_device(device)
+
+    def cb():
+        default_generator = _get_generator(final_device)
+        default_generator.set_offset(offset)
+
+    _lazy_call(cb)
+
+def _get_rng_state_offset(device: Union[int, str, torch.device] = 'cuda') -> int:
+    r"""Returns the random number generator state offset of the specified GPU.
+
+    Args:
+        device (torch.device or int, optional): The device to return the RNG state offset of.
+            Default: ``'cuda'`` (i.e., ``torch.device('cuda')``, the current CUDA device).
+
+    .. warning::
+        This function eagerly initializes CUDA.
+    """
+    _lazy_init()
+    final_device = _get_device(device)
+    default_generator = _get_generator(final_device)
+    return default_generator.get_offset()
+
+
 from .memory import *  # noqa: F403
 
 
diff --git a/docs/main/_modules/torch/utils/data/dataloader.html b/docs/main/_modules/torch/utils/data/dataloader.html
index 99b56cc73162..ff7b44bd57a1 100644
--- a/docs/main/_modules/torch/utils/data/dataloader.html
+++ b/docs/main/_modules/torch/utils/data/dataloader.html
@@ -618,18 +618,21 @@ 

Source code for torch.utils.data.dataloader

         worker_init_fn (Callable, optional): If not ``None``, this will be called on each
             worker subprocess with the worker id (an int in ``[0, num_workers - 1]``) as
             input, after seeding and before data loading. (default: ``None``)
+        multiprocessing_context (str or multiprocessing.context.BaseContext, optional): If
+            ``None``, the default `multiprocessing context`_ of your operating system will
+            be used. (default: ``None``)
         generator (torch.Generator, optional): If not ``None``, this RNG will be used
             by RandomSampler to generate random indexes and multiprocessing to generate
-            `base_seed` for workers. (default: ``None``)
+            ``base_seed`` for workers. (default: ``None``)
         prefetch_factor (int, optional, keyword-only arg): Number of batches loaded
             in advance by each worker. ``2`` means there will be a total of
             2 * num_workers batches prefetched across all workers. (default value depends
             on the set value for num_workers. If value of num_workers=0 default is ``None``.
-            Otherwise if value of num_workers>0 default is ``2``).
-        persistent_workers (bool, optional): If ``True``, the data loader will not shutdown
+            Otherwise, if value of ``num_workers > 0`` default is ``2``).
+        persistent_workers (bool, optional): If ``True``, the data loader will not shut down
             the worker processes after a dataset has been consumed once. This allows to
             maintain the workers `Dataset` instances alive. (default: ``False``)
-        pin_memory_device (str, optional): the device to pin memory to if ``pin_memory`` is
+        pin_memory_device (str, optional): the device to :attr:`pin_memory` to if ``pin_memory`` is
             ``True``.
 
 
@@ -658,6 +661,9 @@ 

Source code for torch.utils.data.dataloader

 
     .. warning:: See :ref:`reproducibility`, and :ref:`dataloader-workers-random-seed`, and
                  :ref:`data-loading-randomness` notes for random seed related questions.
+
+    .. _multiprocessing context:
+        https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods
     """
     dataset: Dataset[T_co]
     batch_size: Optional[int]
diff --git a/docs/main/_sources/compile/nn-module.rst.txt b/docs/main/_sources/compile/nn-module.rst.txt
index cad950e387cb..21a8e624a247 100644
--- a/docs/main/_sources/compile/nn-module.rst.txt
+++ b/docs/main/_sources/compile/nn-module.rst.txt
@@ -28,6 +28,13 @@ By default, `torch.compile` will trace the contents of `nn.Module.__call__` whic
 and run forward/pre-forward hooks.  If you install hooks before calling `torch.compile` and then do not remove
 or alter the hooks later, your use case should be supported by default.
 
+Backward/Pre-backward hooks are generally also supported, with similar caveats: currently graph-breaks in dynamo
+occur when accessing backward_hooks dicts, which is probably avoidable with some work.  Graph-breaks also impact the
+timing of firing backward hooks, since graph-segments are run as autograd-functions which produce all their grads at
+the same time.  Assuming it were possible for dynamo to not graph-break on the presence of backward-hooks, we would
+still expect the backward hooks for a series of modules to all fire together after the whole compiled graph's backward
+has run.
+
 **hooks on 'allowed modules'**
 `torch.compile` treats common modules such as torch.conv, as well as modules that are difficult to trace, specially
 by allowing them to be called opaquely in the dynamo graph instead of traced into by dynamo.  For such modules, hooks
diff --git a/docs/main/compile/nn-module.html b/docs/main/compile/nn-module.html
index 1ca276e63247..d0523d720bb7 100644
--- a/docs/main/compile/nn-module.html
+++ b/docs/main/compile/nn-module.html
@@ -478,6 +478,12 @@ 

nn.Module.__call__ Hooks Usage and limitationsNone, this will be called on each worker subprocess with the worker id (an int in [0, num_workers - 1]) as input, after seeding and before data loading. (default: None)

+
  • multiprocessing_context (str or multiprocessing.context.BaseContext, optional) – If +None, the default multiprocessing context of your operating system will +be used. (default: None)

  • generator (torch.Generator, optional) – If not None, this RNG will be used by RandomSampler to generate random indexes and multiprocessing to generate -base_seed for workers. (default: None)

  • +base_seed for workers. (default: None)

  • prefetch_factor (int, optional, keyword-only arg) – Number of batches loaded in advance by each worker. 2 means there will be a total of 2 * num_workers batches prefetched across all workers. (default value depends on the set value for num_workers. If value of num_workers=0 default is None. -Otherwise if value of num_workers>0 default is 2).

  • -
  • persistent_workers (bool, optional) – If True, the data loader will not shutdown +Otherwise, if value of num_workers > 0 default is 2).

  • +
  • persistent_workers (bool, optional) – If True, the data loader will not shut down the worker processes after a dataset has been consumed once. This allows to maintain the workers Dataset instances alive. (default: False)

  • -
  • pin_memory_device (str, optional) – the device to pin memory to if pin_memory is +

  • pin_memory_device (str, optional) – the device to pin_memory to if pin_memory is True.

  • diff --git a/docs/main/quantization-backend-configuration.html b/docs/main/quantization-backend-configuration.html index c2a17faeca06..ad73b75ffc24 100644 --- a/docs/main/quantization-backend-configuration.html +++ b/docs/main/quantization-backend-configuration.html @@ -479,7 +479,7 @@

    Default values for native configurationsOperator Tags class torch.Tag

    Members:

    -

    core

    -

    generated

    nondeterministic_bitwise

    -

    pointwise

    dynamic_output_shape

    -

    view_copy

    nondeterministic_seeded

    -

    data_dependent_output

    +

    view_copy

    inplace_view

    +

    core

    +

    generated

    +

    pointwise

    +

    data_dependent_output

    property name