From 5c4d4ce7f7fc7584808e476f19263a458d5e3390 Mon Sep 17 00:00:00 2001 From: Siim Kallas Date: Sat, 22 Jul 2023 00:31:44 +0300 Subject: [PATCH 01/29] feat: initial profiling impl --- DEVELOPMENT.md | 2 +- pyproject.toml | 1 + splunk_otel/distro.py | 5 + splunk_otel/profiling/__init__.py | 252 +++++++++++++++++++++++++++ splunk_otel/profiling/profile_pb2.py | 41 +++++ 5 files changed, 300 insertions(+), 1 deletion(-) create mode 100644 splunk_otel/profiling/__init__.py create mode 100644 splunk_otel/profiling/profile_pb2.py diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 89625dad..6a5d4312 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -26,7 +26,7 @@ the test project's environment. Assuming the test project environment lives at version of package in the test project. ``` -make develop DEV_ENV=/path/to/test/project/venv +make develop DEV_VENV=/path/to/test/project/venv ``` This will install an editable version of the package in the test project. Any diff --git a/pyproject.toml b/pyproject.toml index 372c9c97..724c2e52 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,6 +28,7 @@ splunk_distro = "splunk_otel.distro:_SplunkDistro" [tool.poetry.dependencies] cryptography=">=2.0,<=40.0.2" python = "^3.7" +protobuf = "^4.23" opentelemetry-api = "1.18.0" opentelemetry-sdk = "1.18.0" opentelemetry-instrumentation = "0.39b0" diff --git a/splunk_otel/distro.py b/splunk_otel/distro.py index 86b27246..bc791cd1 100644 --- a/splunk_otel/distro.py +++ b/splunk_otel/distro.py @@ -21,6 +21,7 @@ from splunk_otel.options import _Options from splunk_otel.tracing import _configure_tracing, _is_truthy +from splunk_otel.profiling import _start_profiling logger = logging.getLogger(__file__) logger.setLevel(logging.INFO) @@ -30,6 +31,7 @@ class _SplunkDistro(BaseDistro): def __init__(self): tracing_enabled = os.environ.get("OTEL_TRACE_ENABLED", True) self._tracing_enabled = _is_truthy(tracing_enabled) + self._profiling_enabled = True if not self._tracing_enabled: logger.info( "tracing has been disabled with OTEL_TRACE_ENABLED=%s", tracing_enabled @@ -39,6 +41,9 @@ def _configure(self, **kwargs: Dict[str, Any]) -> None: if self._tracing_enabled: _configure_tracing(_Options()) + if self._profiling_enabled: + _start_profiling() + def load_instrumentor(self, entry_point: EntryPoint, **kwargs): if self._tracing_enabled: super().load_instrumentor(entry_point, **kwargs) diff --git a/splunk_otel/profiling/__init__.py b/splunk_otel/profiling/__init__.py new file mode 100644 index 00000000..5878fe32 --- /dev/null +++ b/splunk_otel/profiling/__init__.py @@ -0,0 +1,252 @@ +# Copyright Splunk Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import threading +import time +import sys +import traceback +import gzip +import base64 +import wrapt +from . 
import profile_pb2 +from collections import OrderedDict + +from opentelemetry.exporter.otlp.proto.grpc._log_exporter import ( + OTLPLogExporter, +) + +from opentelemetry.trace.propagation import _SPAN_KEY +from opentelemetry.context import Context + +from opentelemetry.sdk._logs import LogData, LogRecord +from opentelemetry.sdk._logs.export import BatchLogRecordProcessor +from opentelemetry.trace import TraceFlags +from opentelemetry._logs import SeverityNumber +from opentelemetry.sdk.resources import Resource as SDKResource +from opentelemetry.sdk.util.instrumentation import InstrumentationScope +from opentelemetry.semconv.resource import ResourceAttributes +from splunk_otel.version import __version__ + +thread_states = {} +profiler_thread = None +exporter = OTLPLogExporter() +batch_processor = BatchLogRecordProcessor(exporter) + +class StringTable: + def __init__(self): + self.stringMap = OrderedDict() + + def index(self, s): + idx = self.stringMap.get(s) + + if idx != None: + return idx + + idx = len(self.stringMap) + self.stringMap[s] = idx + return idx + + def keys(self): + return list(self.stringMap.keys()) + + +def _encode_cpu_profile(stacktraces): + str_table = StringTable() + locations_table = OrderedDict() + functions_table = OrderedDict() + + def get_function(file_name, function_name): + key = f"{file_name}:{function_name}" + fun = functions_table.get(key) + + if fun == None: + name_id = str_table.index(function_name) + fun = profile_pb2.Function() + fun.id = len(functions_table) + 1 + fun.name = name_id + fun.system_name = name_id + fun.filename = str_table.index(file_name) + + functions_table[key] = fun + + return fun + + def get_line(file_name, function_name, line_no): + line = profile_pb2.Line() + line.function_id = get_function(file_name, function_name).id + line.line = line_no if line_no != 0 else -1 + return line + + def get_location(frame): + (file_name, function_name, line_no) = frame + key = f"{file_name}:{function_name}:{line_no}" + location = locations_table.get(key) + + if location == None: + location = profile_pb2.Location() + location.id = len(locations_table) + 1 + location.line.append(get_line(file_name, function_name, line_no)) + locations_table[key] = location + + return location + + timestamp_key = str_table.index("source.event.time") + trace_id_key = str_table.index("trace_id") + span_id_key = str_table.index("span_id") + event_period_key = str_table.index("source.event.period") + + pb_profile = profile_pb2.Profile() + + event_period_label = profile_pb2.Label() + event_period_label.key = event_period_key + event_period_label.num = 1000 + + samples = [] + for stacktrace in stacktraces: + timestamp_label = profile_pb2.Label() + timestamp_label.key = timestamp_key + timestamp_label.num = int(stacktrace["timestamp"] / 1e6) + labels = [ + timestamp_label, + event_period_label + ] + + trace_context = thread_states.get(stacktrace["tid"]) + if trace_context: + (trace_id, span_id) = trace_context + + trace_id_label = profile_pb2.Label() + trace_id_label.key = trace_id_key + trace_id_label.str = str_table.index(f"{trace_id:#016x}") + labels.append(trace_id_label) + + span_id_label = profile_pb2.Label() + span_id_label.key = span_id_key + span_id_label.str = str_table.index(f"{span_id:#08x}") + labels.append(span_id_label) + + sample = profile_pb2.Sample() + + location_ids = [] + + for frame in reversed(stacktrace["stacktrace"]): + location_ids.append(get_location(frame).id) + + sample.location_id.extend(location_ids) + sample.label.extend(labels) + + 
samples.append(sample) + + pb_profile.sample.extend(samples) + pb_profile.string_table.extend(str_table.keys()) + pb_profile.function.extend(list(functions_table.values())) + pb_profile.location.extend(list(locations_table.values())) + + return gzip.compress(pb_profile.SerializeToString()) + +resource = SDKResource({ + ResourceAttributes.DEPLOYMENT_ENVIRONMENT: "pyprofile", + ResourceAttributes.TELEMETRY_SDK_LANGUAGE: "python", + ResourceAttributes.SERVICE_NAME: "pyprofile3", + ResourceAttributes.PROCESS_RUNTIME_NAME: sys.implementation.name, + "splunk.distro.version": __version__ +}) + +attributes = { + "profiling.data.format": "pprof-gzip-base64", + "profiling.data.type": "cpu", + "com.splunk.sourcetype": "otel.profiling" +} + +def profiler_loop(): + interval = 1.0 + + while True: + profiling_stacktraces = [] + frames = sys._current_frames() + timestamp = int(time.time() * 1e9) + + for thread_id, frame in frames.items(): + prof_stacktrace_frames = [] + stack = traceback.extract_stack(frame) + for sf in stack: + prof_stacktrace_frames.append((sf.filename, sf.name, sf.lineno)) + prof_stacktrace = { + "timestamp": timestamp, + "stacktrace": prof_stacktrace_frames, + "tid": thread_id + } + profiling_stacktraces.append(prof_stacktrace) + + encoded_profile = base64.b64encode(_encode_cpu_profile(profiling_stacktraces)).decode("utf-8") + log_data = LogData( + log_record=LogRecord( + timestamp=int(time.time() * 1e9), + trace_id=0, + span_id=0, + trace_flags=TraceFlags(0x01), + severity_number=SeverityNumber.UNSPECIFIED, + body=encoded_profile, + resource=resource, + attributes=attributes, + ), + instrumentation_scope=InstrumentationScope( + "otel.profiling", "0.1.0" + ), + ) + batch_processor.emit(log_data) + time.sleep(interval) + +def _wrapped_context_attach(wrapped, instance, args, kwargs): + token = wrapped(*args, **kwargs) + + maybe_context = args[0] if args else None + + if maybe_context: + span = maybe_context.get(_SPAN_KEY) + + if span: + thread_id = threading.get_ident() + thread_states[thread_id] = (span.context.trace_id, span.context.span_id) + + return token + +def _wrapped_context_detach(wrapped, instance, args, kwargs): + token = args[0] if args else None + + if token: + prev = token.old_value + thread_id = threading.get_ident() + if isinstance(prev, Context): + span = prev.get(_SPAN_KEY) + + if span: + thread_states[thread_id] = (span.context.trace_id, span.context.span_id) + else: + thread_states[thread_id] = None + else: + thread_states[thread_id] = None + return wrapped(*args, **kwargs) + +def _start_profiling(): + wrapt.wrap_function_wrapper( + "opentelemetry.context", "attach", _wrapped_context_attach + ) + wrapt.wrap_function_wrapper( + "opentelemetry.context", "detach", _wrapped_context_detach + ) + profiler_thread = threading.Thread(name="splunk-otel-profiler", target=profiler_loop, daemon=True) + profiler_thread.start() + + diff --git a/splunk_otel/profiling/profile_pb2.py b/splunk_otel/profiling/profile_pb2.py new file mode 100644 index 00000000..db084267 --- /dev/null +++ b/splunk_otel/profiling/profile_pb2.py @@ -0,0 +1,41 @@ +# -*- coding: utf-8 -*- +# Generated by the protocol buffer compiler. DO NOT EDIT! 
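+# Vendored, pre-generated binding for the pprof profile.proto (package
+# perftools.profiles). If it ever needs regenerating, something like the
+# following should work, assuming protoc is installed and profile.proto
+# is in the current directory:
+#   protoc --python_out=. profile.proto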
+# source: profile.proto +"""Generated protocol buffer code.""" +from google.protobuf import descriptor as _descriptor +from google.protobuf import descriptor_pool as _descriptor_pool +from google.protobuf import symbol_database as _symbol_database +from google.protobuf.internal import builder as _builder +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + + + +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\rprofile.proto\x12\x12perftools.profiles\"\xd5\x03\n\x07Profile\x12\x32\n\x0bsample_type\x18\x01 \x03(\x0b\x32\x1d.perftools.profiles.ValueType\x12*\n\x06sample\x18\x02 \x03(\x0b\x32\x1a.perftools.profiles.Sample\x12,\n\x07mapping\x18\x03 \x03(\x0b\x32\x1b.perftools.profiles.Mapping\x12.\n\x08location\x18\x04 \x03(\x0b\x32\x1c.perftools.profiles.Location\x12.\n\x08\x66unction\x18\x05 \x03(\x0b\x32\x1c.perftools.profiles.Function\x12\x14\n\x0cstring_table\x18\x06 \x03(\t\x12\x13\n\x0b\x64rop_frames\x18\x07 \x01(\x03\x12\x13\n\x0bkeep_frames\x18\x08 \x01(\x03\x12\x12\n\ntime_nanos\x18\t \x01(\x03\x12\x16\n\x0e\x64uration_nanos\x18\n \x01(\x03\x12\x32\n\x0bperiod_type\x18\x0b \x01(\x0b\x32\x1d.perftools.profiles.ValueType\x12\x0e\n\x06period\x18\x0c \x01(\x03\x12\x0f\n\x07\x63omment\x18\r \x03(\x03\x12\x1b\n\x13\x64\x65\x66\x61ult_sample_type\x18\x0e \x01(\x03\"\'\n\tValueType\x12\x0c\n\x04type\x18\x01 \x01(\x03\x12\x0c\n\x04unit\x18\x02 \x01(\x03\"V\n\x06Sample\x12\x13\n\x0blocation_id\x18\x01 \x03(\x04\x12\r\n\x05value\x18\x02 \x03(\x03\x12(\n\x05label\x18\x03 \x03(\x0b\x32\x19.perftools.profiles.Label\"@\n\x05Label\x12\x0b\n\x03key\x18\x01 \x01(\x03\x12\x0b\n\x03str\x18\x02 \x01(\x03\x12\x0b\n\x03num\x18\x03 \x01(\x03\x12\x10\n\x08num_unit\x18\x04 \x01(\x03\"\xdd\x01\n\x07Mapping\x12\n\n\x02id\x18\x01 \x01(\x04\x12\x14\n\x0cmemory_start\x18\x02 \x01(\x04\x12\x14\n\x0cmemory_limit\x18\x03 \x01(\x04\x12\x13\n\x0b\x66ile_offset\x18\x04 \x01(\x04\x12\x10\n\x08\x66ilename\x18\x05 \x01(\x03\x12\x10\n\x08\x62uild_id\x18\x06 \x01(\x03\x12\x15\n\rhas_functions\x18\x07 \x01(\x08\x12\x15\n\rhas_filenames\x18\x08 \x01(\x08\x12\x18\n\x10has_line_numbers\x18\t \x01(\x08\x12\x19\n\x11has_inline_frames\x18\n \x01(\x08\"v\n\x08Location\x12\n\n\x02id\x18\x01 \x01(\x04\x12\x12\n\nmapping_id\x18\x02 \x01(\x04\x12\x0f\n\x07\x61\x64\x64ress\x18\x03 \x01(\x04\x12&\n\x04line\x18\x04 \x03(\x0b\x32\x18.perftools.profiles.Line\x12\x11\n\tis_folded\x18\x05 \x01(\x08\")\n\x04Line\x12\x13\n\x0b\x66unction_id\x18\x01 \x01(\x04\x12\x0c\n\x04line\x18\x02 \x01(\x03\"_\n\x08\x46unction\x12\n\n\x02id\x18\x01 \x01(\x04\x12\x0c\n\x04name\x18\x02 \x01(\x03\x12\x13\n\x0bsystem_name\x18\x03 \x01(\x03\x12\x10\n\x08\x66ilename\x18\x04 \x01(\x03\x12\x12\n\nstart_line\x18\x05 \x01(\x03\x42-\n\x1d\x63om.google.perftools.profilesB\x0cProfileProtob\x06proto3') + +_globals = globals() +_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'profile_pb2', _globals) +if _descriptor._USE_C_DESCRIPTORS == False: + + DESCRIPTOR._options = None + DESCRIPTOR._serialized_options = b'\n\035com.google.perftools.profilesB\014ProfileProto' + _globals['_PROFILE']._serialized_start=38 + _globals['_PROFILE']._serialized_end=507 + _globals['_VALUETYPE']._serialized_start=509 + _globals['_VALUETYPE']._serialized_end=548 + _globals['_SAMPLE']._serialized_start=550 + _globals['_SAMPLE']._serialized_end=636 + _globals['_LABEL']._serialized_start=638 + _globals['_LABEL']._serialized_end=702 + _globals['_MAPPING']._serialized_start=705 + 
_globals['_MAPPING']._serialized_end=926 + _globals['_LOCATION']._serialized_start=928 + _globals['_LOCATION']._serialized_end=1046 + _globals['_LINE']._serialized_start=1048 + _globals['_LINE']._serialized_end=1089 + _globals['_FUNCTION']._serialized_start=1091 + _globals['_FUNCTION']._serialized_end=1186 +# @@protoc_insertion_point(module_scope) From 8e652d19cfe84d7be846938f87ed715093e7b4f3 Mon Sep 17 00:00:00 2001 From: Siim Kallas Date: Mon, 24 Jul 2023 18:07:01 +0300 Subject: [PATCH 02/29] feat: better options --- .pylintrc | 6 +- splunk_otel/distro.py | 12 +- splunk_otel/profiling/__init__.py | 430 +++++++++++++-------------- splunk_otel/profiling/options.py | 54 ++++ splunk_otel/profiling/profile_pb2.py | 50 ++-- 5 files changed, 301 insertions(+), 251 deletions(-) create mode 100644 splunk_otel/profiling/options.py diff --git a/.pylintrc b/.pylintrc index c54be234..8313acf0 100644 --- a/.pylintrc +++ b/.pylintrc @@ -170,7 +170,7 @@ contextmanager-decorators=contextlib.contextmanager # List of members which are set dynamically and missed by pylint inference # system, and so shouldn't trigger E1101 when accessed. Python regular # expressions are accepted. -generated-members=zipkin_pb2.* +generated-members=zipkin_pb2.*,profile_pb2.* # Tells whether missing members accessed in mixin class should be ignored. A # mixin class is detected if its name ends with "mixin" (case insensitive). @@ -471,7 +471,7 @@ max-bool-expr=5 max-branches=12 # Maximum number of locals for function / method body. -max-locals=15 +max-locals=30 # Maximum number of parents for a class (see R0901). max-parents=7 @@ -483,7 +483,7 @@ max-public-methods=20 max-returns=6 # Maximum number of statements in function / method body. -max-statements=50 +max-statements=100 # Minimum number of public methods for a class (see R0903). 
min-public-methods=2 diff --git a/splunk_otel/distro.py b/splunk_otel/distro.py index bc791cd1..b2ac441d 100644 --- a/splunk_otel/distro.py +++ b/splunk_otel/distro.py @@ -20,8 +20,9 @@ from pkg_resources import EntryPoint from splunk_otel.options import _Options -from splunk_otel.tracing import _configure_tracing, _is_truthy from splunk_otel.profiling import _start_profiling +from splunk_otel.profiling.options import _Options as ProfilingOptions +from splunk_otel.tracing import _configure_tracing, _is_truthy logger = logging.getLogger(__file__) logger.setLevel(logging.INFO) @@ -30,19 +31,22 @@ class _SplunkDistro(BaseDistro): def __init__(self): tracing_enabled = os.environ.get("OTEL_TRACE_ENABLED", True) + profiling_enabled = os.environ.get("SPLUNK_PROFILER_ENABLED", False) self._tracing_enabled = _is_truthy(tracing_enabled) - self._profiling_enabled = True + self._profiling_enabled = _is_truthy(profiling_enabled) if not self._tracing_enabled: logger.info( "tracing has been disabled with OTEL_TRACE_ENABLED=%s", tracing_enabled ) def _configure(self, **kwargs: Dict[str, Any]) -> None: + options = _Options() + if self._tracing_enabled: - _configure_tracing(_Options()) + _configure_tracing(options) if self._profiling_enabled: - _start_profiling() + _start_profiling(ProfilingOptions(options.resource)) def load_instrumentor(self, entry_point: EntryPoint, **kwargs): if self._tracing_enabled: diff --git a/splunk_otel/profiling/__init__.py b/splunk_otel/profiling/__init__.py index 5878fe32..f8fb9d53 100644 --- a/splunk_otel/profiling/__init__.py +++ b/splunk_otel/profiling/__init__.py @@ -12,241 +12,229 @@ # See the License for the specific language governing permissions and # limitations under the License. +import base64 +import gzip +import sys import threading import time -import sys import traceback -import gzip -import base64 -import wrapt -from . 
import profile_pb2 from collections import OrderedDict -from opentelemetry.exporter.otlp.proto.grpc._log_exporter import ( - OTLPLogExporter, -) - -from opentelemetry.trace.propagation import _SPAN_KEY +import wrapt +from opentelemetry._logs import SeverityNumber from opentelemetry.context import Context - +from opentelemetry.exporter.otlp.proto.grpc._log_exporter import OTLPLogExporter from opentelemetry.sdk._logs import LogData, LogRecord from opentelemetry.sdk._logs.export import BatchLogRecordProcessor -from opentelemetry.trace import TraceFlags -from opentelemetry._logs import SeverityNumber -from opentelemetry.sdk.resources import Resource as SDKResource from opentelemetry.sdk.util.instrumentation import InstrumentationScope from opentelemetry.semconv.resource import ResourceAttributes +from opentelemetry.trace import TraceFlags +from opentelemetry.trace.propagation import _SPAN_KEY + +from splunk_otel.profiling import profile_pb2 +from splunk_otel.profiling.options import _Options from splunk_otel.version import __version__ thread_states = {} -profiler_thread = None -exporter = OTLPLogExporter() -batch_processor = BatchLogRecordProcessor(exporter) + class StringTable: - def __init__(self): - self.stringMap = OrderedDict() - - def index(self, s): - idx = self.stringMap.get(s) - - if idx != None: - return idx - - idx = len(self.stringMap) - self.stringMap[s] = idx - return idx - - def keys(self): - return list(self.stringMap.keys()) - - -def _encode_cpu_profile(stacktraces): - str_table = StringTable() - locations_table = OrderedDict() - functions_table = OrderedDict() - - def get_function(file_name, function_name): - key = f"{file_name}:{function_name}" - fun = functions_table.get(key) - - if fun == None: - name_id = str_table.index(function_name) - fun = profile_pb2.Function() - fun.id = len(functions_table) + 1 - fun.name = name_id - fun.system_name = name_id - fun.filename = str_table.index(file_name) - - functions_table[key] = fun - - return fun - - def get_line(file_name, function_name, line_no): - line = profile_pb2.Line() - line.function_id = get_function(file_name, function_name).id - line.line = line_no if line_no != 0 else -1 - return line - - def get_location(frame): - (file_name, function_name, line_no) = frame - key = f"{file_name}:{function_name}:{line_no}" - location = locations_table.get(key) - - if location == None: - location = profile_pb2.Location() - location.id = len(locations_table) + 1 - location.line.append(get_line(file_name, function_name, line_no)) - locations_table[key] = location - - return location - - timestamp_key = str_table.index("source.event.time") - trace_id_key = str_table.index("trace_id") - span_id_key = str_table.index("span_id") - event_period_key = str_table.index("source.event.period") - - pb_profile = profile_pb2.Profile() - - event_period_label = profile_pb2.Label() - event_period_label.key = event_period_key - event_period_label.num = 1000 - - samples = [] - for stacktrace in stacktraces: - timestamp_label = profile_pb2.Label() - timestamp_label.key = timestamp_key - timestamp_label.num = int(stacktrace["timestamp"] / 1e6) - labels = [ - timestamp_label, - event_period_label - ] - - trace_context = thread_states.get(stacktrace["tid"]) - if trace_context: - (trace_id, span_id) = trace_context - - trace_id_label = profile_pb2.Label() - trace_id_label.key = trace_id_key - trace_id_label.str = str_table.index(f"{trace_id:#016x}") - labels.append(trace_id_label) - - span_id_label = profile_pb2.Label() - span_id_label.key = span_id_key - 
span_id_label.str = str_table.index(f"{span_id:#08x}") - labels.append(span_id_label) - - sample = profile_pb2.Sample() - - location_ids = [] - - for frame in reversed(stacktrace["stacktrace"]): - location_ids.append(get_location(frame).id) - - sample.location_id.extend(location_ids) - sample.label.extend(labels) - - samples.append(sample) - - pb_profile.sample.extend(samples) - pb_profile.string_table.extend(str_table.keys()) - pb_profile.function.extend(list(functions_table.values())) - pb_profile.location.extend(list(locations_table.values())) - - return gzip.compress(pb_profile.SerializeToString()) - -resource = SDKResource({ - ResourceAttributes.DEPLOYMENT_ENVIRONMENT: "pyprofile", - ResourceAttributes.TELEMETRY_SDK_LANGUAGE: "python", - ResourceAttributes.SERVICE_NAME: "pyprofile3", - ResourceAttributes.PROCESS_RUNTIME_NAME: sys.implementation.name, - "splunk.distro.version": __version__ -}) - -attributes = { - "profiling.data.format": "pprof-gzip-base64", - "profiling.data.type": "cpu", - "com.splunk.sourcetype": "otel.profiling" -} - -def profiler_loop(): - interval = 1.0 - - while True: - profiling_stacktraces = [] - frames = sys._current_frames() - timestamp = int(time.time() * 1e9) - - for thread_id, frame in frames.items(): - prof_stacktrace_frames = [] - stack = traceback.extract_stack(frame) - for sf in stack: - prof_stacktrace_frames.append((sf.filename, sf.name, sf.lineno)) - prof_stacktrace = { - "timestamp": timestamp, - "stacktrace": prof_stacktrace_frames, - "tid": thread_id - } - profiling_stacktraces.append(prof_stacktrace) - - encoded_profile = base64.b64encode(_encode_cpu_profile(profiling_stacktraces)).decode("utf-8") - log_data = LogData( - log_record=LogRecord( - timestamp=int(time.time() * 1e9), - trace_id=0, - span_id=0, - trace_flags=TraceFlags(0x01), - severity_number=SeverityNumber.UNSPECIFIED, - body=encoded_profile, - resource=resource, - attributes=attributes, - ), - instrumentation_scope=InstrumentationScope( - "otel.profiling", "0.1.0" - ), - ) - batch_processor.emit(log_data) - time.sleep(interval) - -def _wrapped_context_attach(wrapped, instance, args, kwargs): - token = wrapped(*args, **kwargs) - - maybe_context = args[0] if args else None - - if maybe_context: - span = maybe_context.get(_SPAN_KEY) - - if span: - thread_id = threading.get_ident() - thread_states[thread_id] = (span.context.trace_id, span.context.span_id) - - return token - -def _wrapped_context_detach(wrapped, instance, args, kwargs): - token = args[0] if args else None - - if token: - prev = token.old_value - thread_id = threading.get_ident() - if isinstance(prev, Context): - span = prev.get(_SPAN_KEY) - - if span: - thread_states[thread_id] = (span.context.trace_id, span.context.span_id) - else: - thread_states[thread_id] = None - else: - thread_states[thread_id] = None - return wrapped(*args, **kwargs) - -def _start_profiling(): - wrapt.wrap_function_wrapper( - "opentelemetry.context", "attach", _wrapped_context_attach - ) - wrapt.wrap_function_wrapper( - "opentelemetry.context", "detach", _wrapped_context_detach - ) - profiler_thread = threading.Thread(name="splunk-otel-profiler", target=profiler_loop, daemon=True) - profiler_thread.start() + def __init__(self): + self.strings = OrderedDict() + + def index(self, token): + idx = self.strings.get(token) + + if idx: + return idx + + idx = len(self.strings) + self.strings[token] = idx + return idx + def keys(self): + return list(self.strings.keys()) + +def _encode_cpu_profile(stacktraces, interval): + str_table = StringTable() + 
locations_table = OrderedDict() + functions_table = OrderedDict() + + def get_function(file_name, function_name): + key = f"{file_name}:{function_name}" + fun = functions_table.get(key) + + if fun is None: + name_id = str_table.index(function_name) + fun = profile_pb2.Function() + fun.id = len(functions_table) + 1 + fun.name = name_id + fun.system_name = name_id + fun.filename = str_table.index(file_name) + functions_table[key] = fun + + return fun + + def get_line(file_name, function_name, line_no): + line = profile_pb2.Line() + line.function_id = get_function(file_name, function_name).id + line.line = line_no if line_no != 0 else -1 + return line + + def get_location(frame): + (file_name, function_name, line_no) = frame + key = f"{file_name}:{function_name}:{line_no}" + location = locations_table.get(key) + + if location is None: + location = profile_pb2.Location() + location.id = len(locations_table) + 1 + location.line.append(get_line(file_name, function_name, line_no)) + locations_table[key] = location + + return location + + timestamp_key = str_table.index("source.event.time") + trace_id_key = str_table.index("trace_id") + span_id_key = str_table.index("span_id") + event_period_key = str_table.index("source.event.period") + + pb_profile = profile_pb2.Profile() + + event_period_label = profile_pb2.Label() + event_period_label.key = event_period_key + event_period_label.num = interval + + samples = [] + for stacktrace in stacktraces: + timestamp_label = profile_pb2.Label() + timestamp_label.key = timestamp_key + timestamp_label.num = int(stacktrace["timestamp"] / 1e6) + labels = [timestamp_label, event_period_label] + + trace_context = thread_states.get(stacktrace["tid"]) + if trace_context: + (trace_id, span_id) = trace_context + + trace_id_label = profile_pb2.Label() + trace_id_label.key = trace_id_key + trace_id_label.str = str_table.index(f"{trace_id:#016x}") + labels.append(trace_id_label) + + span_id_label = profile_pb2.Label() + span_id_label.key = span_id_key + span_id_label.str = str_table.index(f"{span_id:#08x}") + labels.append(span_id_label) + + sample = profile_pb2.Sample() + + location_ids = [] + + for frame in reversed(stacktrace["stacktrace"]): + location_ids.append(get_location(frame).id) + + sample.location_id.extend(location_ids) + sample.label.extend(labels) + + samples.append(sample) + + pb_profile.sample.extend(samples) + pb_profile.string_table.extend(str_table.keys()) + pb_profile.function.extend(list(functions_table.values())) + pb_profile.location.extend(list(locations_table.values())) + + return gzip.compress(pb_profile.SerializeToString()) + + +def _profiler_loop(options: _Options): + interval = options.call_stack_interval + + exporter = OTLPLogExporter(options.endpoint) + batch_processor = BatchLogRecordProcessor(exporter) + + while True: + profiling_stacktraces = [] + frames = sys._current_frames() + timestamp = int(time.time() * 1e9) + + for thread_id, frame in frames.items(): + prof_stacktrace_frames = [] + stack = traceback.extract_stack(frame) + for sf in stack: + prof_stacktrace_frames.append((sf.filename, sf.name, sf.lineno)) + prof_stacktrace = { + "timestamp": timestamp, + "stacktrace": prof_stacktrace_frames, + "tid": thread_id, + } + profiling_stacktraces.append(prof_stacktrace) + + encoded_profile = base64.b64encode( + _encode_cpu_profile(profiling_stacktraces, interval) + ).decode() + log_data = LogData( + log_record=LogRecord( + timestamp=timestamp, + trace_id=0, + span_id=0, + trace_flags=TraceFlags(0x01), + 
severity_number=SeverityNumber.UNSPECIFIED, + body=encoded_profile, + resource=options.resource, + attributes={ + "profiling.data.format": "pprof-gzip-base64", + "profiling.data.type": "cpu", + "com.splunk.sourcetype": "otel.profiling", + }, + ), + instrumentation_scope=InstrumentationScope("otel.profiling", "0.1.0"), + ) + batch_processor.emit(log_data) + time.sleep(interval / 1e3) + + +def _wrapped_context_attach(wrapped, _instance, args, kwargs): + token = wrapped(*args, **kwargs) + + maybe_context = args[0] if args else None + + if maybe_context: + span = maybe_context.get(_SPAN_KEY) + + if span: + thread_id = threading.get_ident() + thread_states[thread_id] = (span.context.trace_id, span.context.span_id) + + return token + + +def _wrapped_context_detach(wrapped, _instance, args, kwargs): + token = args[0] if args else None + + if token: + prev = token.old_value + thread_id = threading.get_ident() + if isinstance(prev, Context): + span = prev.get(_SPAN_KEY) + + if span: + thread_states[thread_id] = (span.context.trace_id, span.context.span_id) + else: + thread_states[thread_id] = None + else: + thread_states[thread_id] = None + return wrapped(*args, **kwargs) + + +def _start_profiling(options: _Options): + wrapt.wrap_function_wrapper( + "opentelemetry.context", "attach", _wrapped_context_attach + ) + wrapt.wrap_function_wrapper( + "opentelemetry.context", "detach", _wrapped_context_detach + ) + profiler_thread = threading.Thread( + name="splunk-otel-profiler", target=_profiler_loop, args=[options], daemon=True + ) + profiler_thread.start() diff --git a/splunk_otel/profiling/options.py b/splunk_otel/profiling/options.py new file mode 100644 index 00000000..bb7e346c --- /dev/null +++ b/splunk_otel/profiling/options.py @@ -0,0 +1,54 @@ +# Copyright Splunk Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
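+#
+# Option resolution implemented below:
+#   endpoint:            SPLUNK_PROFILER_LOGS_ENDPOINT, then
+#                        OTEL_EXPORTER_OTLP_ENDPOINT, then http://localhost:4317
+#   call_stack_interval: SPLUNK_PROFILER_CALL_STACK_INTERVAL, interpreted as
+#                        milliseconds, defaulting to 1_000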
+ +from os import environ +from typing import Optional + +from opentelemetry.sdk.resources import Resource + + +class _Options: + resource: Resource + endpoint: str + call_stack_interval: int + + def __init__( + self, + resource: Resource, + endpoint: Optional[str] = None, + call_stack_interval: Optional[int] = None, + ): + self.resource = resource + self.endpoint = _Options._get_endpoint(endpoint) + self.call_stack_interval = _Options._get_call_stack_interval(call_stack_interval) + + @staticmethod + def _get_endpoint(endpoint: Optional[str]) -> str: + if not endpoint: + endpoint = environ.get("SPLUNK_PROFILER_LOGS_ENDPOINT") + + if not endpoint: + endpoint = environ.get("OTEL_EXPORTER_OTLP_ENDPOINT") + + return endpoint or "http://localhost:4317" + + @staticmethod + def _get_call_stack_interval(interval: Optional[int]) -> int: + if not interval: + interval = environ.get("SPLUNK_PROFILER_CALL_STACK_INTERVAL") + + if interval: + return int(interval) + + return 1_000 diff --git a/splunk_otel/profiling/profile_pb2.py b/splunk_otel/profiling/profile_pb2.py index db084267..777da69d 100644 --- a/splunk_otel/profiling/profile_pb2.py +++ b/splunk_otel/profiling/profile_pb2.py @@ -1,3 +1,4 @@ +# pylint: skip-file # -*- coding: utf-8 -*- # Generated by the protocol buffer compiler. DO NOT EDIT! # source: profile.proto @@ -6,36 +7,39 @@ from google.protobuf import descriptor_pool as _descriptor_pool from google.protobuf import symbol_database as _symbol_database from google.protobuf.internal import builder as _builder + # @@protoc_insertion_point(imports) _sym_db = _symbol_database.Default() - - -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\rprofile.proto\x12\x12perftools.profiles\"\xd5\x03\n\x07Profile\x12\x32\n\x0bsample_type\x18\x01 \x03(\x0b\x32\x1d.perftools.profiles.ValueType\x12*\n\x06sample\x18\x02 \x03(\x0b\x32\x1a.perftools.profiles.Sample\x12,\n\x07mapping\x18\x03 \x03(\x0b\x32\x1b.perftools.profiles.Mapping\x12.\n\x08location\x18\x04 \x03(\x0b\x32\x1c.perftools.profiles.Location\x12.\n\x08\x66unction\x18\x05 \x03(\x0b\x32\x1c.perftools.profiles.Function\x12\x14\n\x0cstring_table\x18\x06 \x03(\t\x12\x13\n\x0b\x64rop_frames\x18\x07 \x01(\x03\x12\x13\n\x0bkeep_frames\x18\x08 \x01(\x03\x12\x12\n\ntime_nanos\x18\t \x01(\x03\x12\x16\n\x0e\x64uration_nanos\x18\n \x01(\x03\x12\x32\n\x0bperiod_type\x18\x0b \x01(\x0b\x32\x1d.perftools.profiles.ValueType\x12\x0e\n\x06period\x18\x0c \x01(\x03\x12\x0f\n\x07\x63omment\x18\r \x03(\x03\x12\x1b\n\x13\x64\x65\x66\x61ult_sample_type\x18\x0e \x01(\x03\"\'\n\tValueType\x12\x0c\n\x04type\x18\x01 \x01(\x03\x12\x0c\n\x04unit\x18\x02 \x01(\x03\"V\n\x06Sample\x12\x13\n\x0blocation_id\x18\x01 \x03(\x04\x12\r\n\x05value\x18\x02 \x03(\x03\x12(\n\x05label\x18\x03 \x03(\x0b\x32\x19.perftools.profiles.Label\"@\n\x05Label\x12\x0b\n\x03key\x18\x01 \x01(\x03\x12\x0b\n\x03str\x18\x02 \x01(\x03\x12\x0b\n\x03num\x18\x03 \x01(\x03\x12\x10\n\x08num_unit\x18\x04 \x01(\x03\"\xdd\x01\n\x07Mapping\x12\n\n\x02id\x18\x01 \x01(\x04\x12\x14\n\x0cmemory_start\x18\x02 \x01(\x04\x12\x14\n\x0cmemory_limit\x18\x03 \x01(\x04\x12\x13\n\x0b\x66ile_offset\x18\x04 \x01(\x04\x12\x10\n\x08\x66ilename\x18\x05 \x01(\x03\x12\x10\n\x08\x62uild_id\x18\x06 \x01(\x03\x12\x15\n\rhas_functions\x18\x07 \x01(\x08\x12\x15\n\rhas_filenames\x18\x08 \x01(\x08\x12\x18\n\x10has_line_numbers\x18\t \x01(\x08\x12\x19\n\x11has_inline_frames\x18\n \x01(\x08\"v\n\x08Location\x12\n\n\x02id\x18\x01 \x01(\x04\x12\x12\n\nmapping_id\x18\x02 \x01(\x04\x12\x0f\n\x07\x61\x64\x64ress\x18\x03 
\x01(\x04\x12&\n\x04line\x18\x04 \x03(\x0b\x32\x18.perftools.profiles.Line\x12\x11\n\tis_folded\x18\x05 \x01(\x08\")\n\x04Line\x12\x13\n\x0b\x66unction_id\x18\x01 \x01(\x04\x12\x0c\n\x04line\x18\x02 \x01(\x03\"_\n\x08\x46unction\x12\n\n\x02id\x18\x01 \x01(\x04\x12\x0c\n\x04name\x18\x02 \x01(\x03\x12\x13\n\x0bsystem_name\x18\x03 \x01(\x03\x12\x10\n\x08\x66ilename\x18\x04 \x01(\x03\x12\x12\n\nstart_line\x18\x05 \x01(\x03\x42-\n\x1d\x63om.google.perftools.profilesB\x0cProfileProtob\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( + b'\n\rprofile.proto\x12\x12perftools.profiles"\xd5\x03\n\x07Profile\x12\x32\n\x0bsample_type\x18\x01 \x03(\x0b\x32\x1d.perftools.profiles.ValueType\x12*\n\x06sample\x18\x02 \x03(\x0b\x32\x1a.perftools.profiles.Sample\x12,\n\x07mapping\x18\x03 \x03(\x0b\x32\x1b.perftools.profiles.Mapping\x12.\n\x08location\x18\x04 \x03(\x0b\x32\x1c.perftools.profiles.Location\x12.\n\x08\x66unction\x18\x05 \x03(\x0b\x32\x1c.perftools.profiles.Function\x12\x14\n\x0cstring_table\x18\x06 \x03(\t\x12\x13\n\x0b\x64rop_frames\x18\x07 \x01(\x03\x12\x13\n\x0bkeep_frames\x18\x08 \x01(\x03\x12\x12\n\ntime_nanos\x18\t \x01(\x03\x12\x16\n\x0e\x64uration_nanos\x18\n \x01(\x03\x12\x32\n\x0bperiod_type\x18\x0b \x01(\x0b\x32\x1d.perftools.profiles.ValueType\x12\x0e\n\x06period\x18\x0c \x01(\x03\x12\x0f\n\x07\x63omment\x18\r \x03(\x03\x12\x1b\n\x13\x64\x65\x66\x61ult_sample_type\x18\x0e \x01(\x03"\'\n\tValueType\x12\x0c\n\x04type\x18\x01 \x01(\x03\x12\x0c\n\x04unit\x18\x02 \x01(\x03"V\n\x06Sample\x12\x13\n\x0blocation_id\x18\x01 \x03(\x04\x12\r\n\x05value\x18\x02 \x03(\x03\x12(\n\x05label\x18\x03 \x03(\x0b\x32\x19.perftools.profiles.Label"@\n\x05Label\x12\x0b\n\x03key\x18\x01 \x01(\x03\x12\x0b\n\x03str\x18\x02 \x01(\x03\x12\x0b\n\x03num\x18\x03 \x01(\x03\x12\x10\n\x08num_unit\x18\x04 \x01(\x03"\xdd\x01\n\x07Mapping\x12\n\n\x02id\x18\x01 \x01(\x04\x12\x14\n\x0cmemory_start\x18\x02 \x01(\x04\x12\x14\n\x0cmemory_limit\x18\x03 \x01(\x04\x12\x13\n\x0b\x66ile_offset\x18\x04 \x01(\x04\x12\x10\n\x08\x66ilename\x18\x05 \x01(\x03\x12\x10\n\x08\x62uild_id\x18\x06 \x01(\x03\x12\x15\n\rhas_functions\x18\x07 \x01(\x08\x12\x15\n\rhas_filenames\x18\x08 \x01(\x08\x12\x18\n\x10has_line_numbers\x18\t \x01(\x08\x12\x19\n\x11has_inline_frames\x18\n \x01(\x08"v\n\x08Location\x12\n\n\x02id\x18\x01 \x01(\x04\x12\x12\n\nmapping_id\x18\x02 \x01(\x04\x12\x0f\n\x07\x61\x64\x64ress\x18\x03 \x01(\x04\x12&\n\x04line\x18\x04 \x03(\x0b\x32\x18.perftools.profiles.Line\x12\x11\n\tis_folded\x18\x05 \x01(\x08")\n\x04Line\x12\x13\n\x0b\x66unction_id\x18\x01 \x01(\x04\x12\x0c\n\x04line\x18\x02 \x01(\x03"_\n\x08\x46unction\x12\n\n\x02id\x18\x01 \x01(\x04\x12\x0c\n\x04name\x18\x02 \x01(\x03\x12\x13\n\x0bsystem_name\x18\x03 \x01(\x03\x12\x10\n\x08\x66ilename\x18\x04 \x01(\x03\x12\x12\n\nstart_line\x18\x05 \x01(\x03\x42-\n\x1d\x63om.google.perftools.profilesB\x0cProfileProtob\x06proto3' +) _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) -_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'profile_pb2', _globals) -if _descriptor._USE_C_DESCRIPTORS == False: +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "profile_pb2", _globals) +if _descriptor._USE_C_DESCRIPTORS is False: - DESCRIPTOR._options = None - DESCRIPTOR._serialized_options = b'\n\035com.google.perftools.profilesB\014ProfileProto' - _globals['_PROFILE']._serialized_start=38 - _globals['_PROFILE']._serialized_end=507 - _globals['_VALUETYPE']._serialized_start=509 - _globals['_VALUETYPE']._serialized_end=548 - 
_globals['_SAMPLE']._serialized_start=550 - _globals['_SAMPLE']._serialized_end=636 - _globals['_LABEL']._serialized_start=638 - _globals['_LABEL']._serialized_end=702 - _globals['_MAPPING']._serialized_start=705 - _globals['_MAPPING']._serialized_end=926 - _globals['_LOCATION']._serialized_start=928 - _globals['_LOCATION']._serialized_end=1046 - _globals['_LINE']._serialized_start=1048 - _globals['_LINE']._serialized_end=1089 - _globals['_FUNCTION']._serialized_start=1091 - _globals['_FUNCTION']._serialized_end=1186 + DESCRIPTOR._options = None + DESCRIPTOR._serialized_options = ( + b"\n\035com.google.perftools.profilesB\014ProfileProto" + ) + _globals["_PROFILE"]._serialized_start = 38 + _globals["_PROFILE"]._serialized_end = 507 + _globals["_VALUETYPE"]._serialized_start = 509 + _globals["_VALUETYPE"]._serialized_end = 548 + _globals["_SAMPLE"]._serialized_start = 550 + _globals["_SAMPLE"]._serialized_end = 636 + _globals["_LABEL"]._serialized_start = 638 + _globals["_LABEL"]._serialized_end = 702 + _globals["_MAPPING"]._serialized_start = 705 + _globals["_MAPPING"]._serialized_end = 926 + _globals["_LOCATION"]._serialized_start = 928 + _globals["_LOCATION"]._serialized_end = 1046 + _globals["_LINE"]._serialized_start = 1048 + _globals["_LINE"]._serialized_end = 1089 + _globals["_FUNCTION"]._serialized_start = 1091 + _globals["_FUNCTION"]._serialized_end = 1186 # @@protoc_insertion_point(module_scope) From 8b2db33eedf350420a3e03a4a7de0a42448ba916 Mon Sep 17 00:00:00 2001 From: Siim Kallas Date: Mon, 24 Jul 2023 18:08:42 +0300 Subject: [PATCH 03/29] chore: add todo --- splunk_otel/profiling/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/splunk_otel/profiling/__init__.py b/splunk_otel/profiling/__init__.py index f8fb9d53..092f0943 100644 --- a/splunk_otel/profiling/__init__.py +++ b/splunk_otel/profiling/__init__.py @@ -160,6 +160,7 @@ def _profiler_loop(options: _Options): for thread_id, frame in frames.items(): prof_stacktrace_frames = [] + # TODO: This is potentially really slow due to code line lookups in the file stack = traceback.extract_stack(frame) for sf in stack: prof_stacktrace_frames.append((sf.filename, sf.name, sf.lineno)) From a335ce46d185cfb9a2b0433446e23948d686bf98 Mon Sep 17 00:00:00 2001 From: Siim Kallas Date: Thu, 12 Oct 2023 23:54:43 +0300 Subject: [PATCH 04/29] add tests --- .pylintrc | 3 +- splunk_otel/__init__.py | 1 + splunk_otel/distro.py | 1 - splunk_otel/profiling/__init__.py | 43 ++++++- splunk_otel/profiling/options.py | 4 +- splunk_otel/profiling/profile_pb2.py | 1 - tests/integration/conftest.py | 6 +- tests/integration/docker-compose.yml | 2 +- tests/unit/test_profiling.py | 169 +++++++++++++++++++++++++++ 9 files changed, 217 insertions(+), 13 deletions(-) create mode 100644 tests/unit/test_profiling.py diff --git a/.pylintrc b/.pylintrc index fb5c5855..7214b01b 100644 --- a/.pylintrc +++ b/.pylintrc @@ -439,7 +439,8 @@ exclude-protected=_asdict, _replace, _source, _make, - _Span + _Span, + _current_frames # List of valid names for the first argument in a class method. 
valid-classmethod-first-arg=cls diff --git a/splunk_otel/__init__.py b/splunk_otel/__init__.py index efac50d8..a620135b 100644 --- a/splunk_otel/__init__.py +++ b/splunk_otel/__init__.py @@ -25,5 +25,6 @@ # pylint: disable=wrong-import-position from .metrics import start_metrics # noqa: E402 +from .profiling import start_profiling # noqa: E402 from .tracing import start_tracing # noqa: E402 from .version import __version__ # noqa: E402 diff --git a/splunk_otel/distro.py b/splunk_otel/distro.py index 861c919b..48dc5ab8 100644 --- a/splunk_otel/distro.py +++ b/splunk_otel/distro.py @@ -26,7 +26,6 @@ from splunk_otel.tracing import _configure_tracing from splunk_otel.util import _is_truthy - otel_log_level = os.environ.get("OTEL_LOG_LEVEL", logging.INFO) logger = logging.getLogger(__file__) diff --git a/splunk_otel/profiling/__init__.py b/splunk_otel/profiling/__init__.py index 092f0943..8296710f 100644 --- a/splunk_otel/profiling/__init__.py +++ b/splunk_otel/profiling/__init__.py @@ -19,6 +19,7 @@ import time import traceback from collections import OrderedDict +from typing import Dict, Optional, Union import wrapt from opentelemetry._logs import SeverityNumber @@ -31,11 +32,13 @@ from opentelemetry.trace import TraceFlags from opentelemetry.trace.propagation import _SPAN_KEY +import splunk_otel from splunk_otel.profiling import profile_pb2 from splunk_otel.profiling.options import _Options from splunk_otel.version import __version__ thread_states = {} +batch_processor = None class StringTable: @@ -68,7 +71,7 @@ def get_function(file_name, function_name): if fun is None: name_id = str_table.index(function_name) fun = profile_pb2.Function() - fun.id = len(functions_table) + 1 + fun.id = len(functions_table) fun.name = name_id fun.system_name = name_id fun.filename = str_table.index(file_name) @@ -89,7 +92,7 @@ def get_location(frame): if location is None: location = profile_pb2.Location() - location.id = len(locations_table) + 1 + location.id = len(locations_table) location.line.append(get_line(file_name, function_name, line_no)) locations_table[key] = location @@ -98,6 +101,7 @@ def get_location(frame): timestamp_key = str_table.index("source.event.time") trace_id_key = str_table.index("trace_id") span_id_key = str_table.index("span_id") + thread_id_key = str_table.index("thread.id") event_period_key = str_table.index("source.event.period") pb_profile = profile_pb2.Profile() @@ -108,12 +112,19 @@ def get_location(frame): samples = [] for stacktrace in stacktraces: + thread_id = stacktrace["tid"] + timestamp_label = profile_pb2.Label() timestamp_label.key = timestamp_key timestamp_label.num = int(stacktrace["timestamp"] / 1e6) - labels = [timestamp_label, event_period_label] - trace_context = thread_states.get(stacktrace["tid"]) + thread_id_label = profile_pb2.Label() + thread_id_label.key = thread_id_key + thread_id_label.num = thread_id + + labels = [timestamp_label, event_period_label, thread_id_label] + + trace_context = thread_states.get(thread_id) if trace_context: (trace_id, span_id) = trace_context @@ -151,6 +162,8 @@ def _profiler_loop(options: _Options): interval = options.call_stack_interval exporter = OTLPLogExporter(options.endpoint) + # pylint: disable-next=global-statement + global batch_processor batch_processor = BatchLogRecordProcessor(exporter) while True: @@ -228,14 +241,34 @@ def _wrapped_context_detach(wrapped, _instance, args, kwargs): return wrapped(*args, **kwargs) -def _start_profiling(options: _Options): +def _force_flush(): + if batch_processor: + 
batch_processor.force_flush() + + +def _start_profiling(options): wrapt.wrap_function_wrapper( "opentelemetry.context", "attach", _wrapped_context_attach ) wrapt.wrap_function_wrapper( "opentelemetry.context", "detach", _wrapped_context_detach ) + profiler_thread = threading.Thread( name="splunk-otel-profiler", target=_profiler_loop, args=[options], daemon=True ) profiler_thread.start() + + +def start_profiling( + service_name: Optional[str] = None, + resource_attributes: Optional[Dict[str, Union[str, bool, int, float]]] = None, + endpoint: Optional[str] = None, + call_stack_interval: Optional[int] = None, +): + # pylint: disable-next=protected-access + resource = splunk_otel.options._Options._get_resource( + service_name, resource_attributes + ) + options = _Options(resource, endpoint, call_stack_interval) + _start_profiling(options) diff --git a/splunk_otel/profiling/options.py b/splunk_otel/profiling/options.py index bb7e346c..3ec08ff4 100644 --- a/splunk_otel/profiling/options.py +++ b/splunk_otel/profiling/options.py @@ -51,4 +51,6 @@ def _get_call_stack_interval(interval: Optional[int]) -> int: if interval: return int(interval) - return 1_000 + return 1_000 + + return interval diff --git a/splunk_otel/profiling/profile_pb2.py b/splunk_otel/profiling/profile_pb2.py index 777da69d..a597ba98 100644 --- a/splunk_otel/profiling/profile_pb2.py +++ b/splunk_otel/profiling/profile_pb2.py @@ -21,7 +21,6 @@ _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "profile_pb2", _globals) if _descriptor._USE_C_DESCRIPTORS is False: - DESCRIPTOR._options = None DESCRIPTOR._serialized_options = ( b"\n\035com.google.perftools.profilesB\014ProfileProto" diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 122aba0b..e5916c14 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -36,7 +36,7 @@ def integration(pytestconfig, docker_ip, docker_services): timeout=10, pause=0.1, check=lambda: is_responsive(url) ) return IntegrationSession( - poll_url=f"http://{docker_ip}:8378", + poll_url=f"http://{docker_ip}:8378/spans", rootdir=path.join(str(pytestconfig.rootdir), "tests", "integration"), ) @@ -45,7 +45,7 @@ def integration(pytestconfig, docker_ip, docker_services): def integration_local(pytestconfig): """Local non-docer based replacement for `integration` fixture""" return IntegrationSession( - poll_url="http://localhost:8378", + poll_url="http://localhost:8378/spans", rootdir=path.join(str(pytestconfig.rootdir), "tests", "integration"), ) @@ -55,6 +55,6 @@ def is_responsive(url) -> bool: response = requests.get(url, timeout=10) if response.status_code == 200: return True - except ConnectionError: + except requests.exceptions.ConnectionError: pass return False diff --git a/tests/integration/docker-compose.yml b/tests/integration/docker-compose.yml index 5119c056..3290d1fc 100644 --- a/tests/integration/docker-compose.yml +++ b/tests/integration/docker-compose.yml @@ -1,7 +1,7 @@ version: '3' services: collector: - image: quay.io/signalfx/splunk-otel-collector:0.48.0 + image: quay.io/signalfx/splunk-otel-collector:0.85.0 environment: - SPLUNK_CONFIG=/etc/otel/config.yml volumes: diff --git a/tests/unit/test_profiling.py b/tests/unit/test_profiling.py new file mode 100644 index 00000000..68b9700c --- /dev/null +++ b/tests/unit/test_profiling.py @@ -0,0 +1,169 @@ +# Copyright Splunk Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import base64 +import gzip +import random +import threading +import time +import unittest +from unittest.mock import patch + +from opentelemetry import trace as trace_api +from opentelemetry.exporter.otlp.proto.grpc._log_exporter import OTLPLogExporter +from opentelemetry.sdk._logs.export import LogExportResult +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import ConsoleSpanExporter, SimpleSpanProcessor + +from splunk_otel.profiling import _force_flush, profile_pb2, start_profiling + + +def do_work(time_ms): + now = time.time() + target = now + time_ms / 1000.0 + + total = 0.0 + while now < target: + value = random.random() + for _ in range(0, 10000): + value = value + random.random() + + total = total + value + + now = time.time() + time.sleep(0.01) + + return total + + +def find_label(sample, name, strings): + for label in sample.label: + if strings[label.key] == name: + return label + return None + + +class TestProfiling(unittest.TestCase): + def setUp(self): + self.span_id = None + self.trace_id = None + self.export_patcher = patch.object(OTLPLogExporter, "export") + self.export_mock = self.export_patcher.start() + self.export_mock.return_value = LogExportResult.SUCCESS + + def tearDown(self): + self.export_patcher.stop() + + def _assert_scope(self, scope): + self.assertEqual(scope.name, "otel.profiling") + self.assertEqual(scope.version, "0.1.0") + + def _assert_log_record(self, log_record): + self.assertTrue(int(time.time() * 1e9) - log_record.timestamp <= 2e9) + + self.assertEqual(log_record.trace_id, 0) + self.assertEqual(log_record.span_id, 0) + # Attributes are of type BoundedAttributes, get the underlying dict + self.assertDictEqual( + log_record.attributes.copy(), + { + "profiling.data.format": "pprof-gzip-base64", + "profiling.data.type": "cpu", + "com.splunk.sourcetype": "otel.profiling", + }, + ) + + resource = log_record.resource.attributes + + self.assertEqual(resource["foo"], "bar") + self.assertEqual(resource["telemetry.sdk.language"], "python") + self.assertEqual(resource["service.name"], "prof-export-test") + + body = log_record.body + pprof_gzipped = base64.b64decode(body) + pprof = gzip.decompress(pprof_gzipped) + profile = profile_pb2.Profile() + profile.ParseFromString(pprof) + + strings = profile.string_table + locations = profile.location + functions = profile.function + + main_thread_samples = [] + + for sample in profile.sample: + thread_id = find_label(sample, "thread.id", strings).num + period = find_label(sample, "source.event.period", strings).num + self.assertEqual(period, 100) + + # Only care for the main thread which is running our busy function + if thread_id == threading.get_ident(): + main_thread_samples.append(sample) + + for sample in main_thread_samples: + for location_id in sample.location_id: + location = locations[location_id] + function = functions[location.line[0].function_id] + function_name = strings[function.name] + file_name = 
strings[function.filename] + self.assertGreater(len(file_name), 0) + + if function_name == "do_work": + span_id = int(strings[find_label(sample, "span_id", strings).str], 16) + trace_id = int( + strings[find_label(sample, "trace_id", strings).str], 16 + ) + self.assertEqual(span_id, self.span_id) + self.assertEqual(trace_id, self.trace_id) + return True + + return False + + def test_profiling_export(self): + provider = TracerProvider() + processor = SimpleSpanProcessor(ConsoleSpanExporter()) + provider.add_span_processor(processor) + trace_api.set_tracer_provider(provider) + tracer = trace_api.get_tracer("tests.tracer") + + start_profiling( + service_name="prof-export-test", + resource_attributes={"foo": "bar"}, + call_stack_interval=100, + ) + + with tracer.start_as_current_span("add-some-numbers") as span: + span_ctx = span.get_span_context() + self.span_id = span_ctx.span_id + self.trace_id = span_ctx.trace_id + do_work(550) + + _force_flush() + + self.export_mock.assert_called() + + args = self.export_mock.call_args + (log_datas,) = args[0] + + # We expect at least 5 profiling cycles during a 550ms run with call_stack_interval=100 + self.assertGreaterEqual(len(log_datas), 5) + + busy_function_profiled = False + for log_data in log_datas: + log_record = log_data.log_record + self._assert_scope(log_data.instrumentation_scope) + if self._assert_log_record(log_record): + busy_function_profiled = True + + self.assertTrue(busy_function_profiled) From aca9fa09d7b247c245666d1ad344a1ce73ab6bfe Mon Sep 17 00:00:00 2001 From: Siim Kallas Date: Mon, 30 Oct 2023 12:19:45 +0200 Subject: [PATCH 05/29] add debug logs --- splunk_otel/distro.py | 8 ++------ splunk_otel/metrics.py | 8 ++------ splunk_otel/profiling/__init__.py | 9 +++++++++ splunk_otel/tracing.py | 8 ++------ splunk_otel/util.py | 24 ++++++++++++++++++++++++ 5 files changed, 39 insertions(+), 18 deletions(-) diff --git a/splunk_otel/distro.py b/splunk_otel/distro.py index 48dc5ab8..4bd24f99 100644 --- a/splunk_otel/distro.py +++ b/splunk_otel/distro.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import logging import os from typing import Any, Dict @@ -24,12 +23,9 @@ from splunk_otel.profiling import _start_profiling from splunk_otel.profiling.options import _Options as ProfilingOptions from splunk_otel.tracing import _configure_tracing -from splunk_otel.util import _is_truthy +from splunk_otel.util import _get_logger, _is_truthy -otel_log_level = os.environ.get("OTEL_LOG_LEVEL", logging.INFO) - -logger = logging.getLogger(__file__) -logger.setLevel(otel_log_level) +logger = _get_logger(__name__) class _SplunkDistro(BaseDistro): diff --git a/splunk_otel/metrics.py b/splunk_otel/metrics.py index 4b19b3db..8db04ce4 100644 --- a/splunk_otel/metrics.py +++ b/splunk_otel/metrics.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
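+# Log level resolution (OTEL_LOG_LEVEL) moves into splunk_otel.util;
+# _get_logger below replaces the per-module getLogger/setLevel pair.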
-import logging import os from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter @@ -21,12 +20,9 @@ from opentelemetry.sdk.metrics import MeterProvider from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader -from splunk_otel.util import _is_truthy +from splunk_otel.util import _get_logger, _is_truthy -otel_log_level = os.environ.get("OTEL_LOG_LEVEL", logging.INFO) - -logger = logging.getLogger(__file__) -logger.setLevel(otel_log_level) +logger = _get_logger(__name__) def start_metrics() -> MeterProvider: diff --git a/splunk_otel/profiling/__init__.py b/splunk_otel/profiling/__init__.py index 8296710f..dec5e56d 100644 --- a/splunk_otel/profiling/__init__.py +++ b/splunk_otel/profiling/__init__.py @@ -14,6 +14,7 @@ import base64 import gzip +import os import sys import threading import time @@ -35,8 +36,11 @@ import splunk_otel from splunk_otel.profiling import profile_pb2 from splunk_otel.profiling.options import _Options +from splunk_otel.util import _get_logger from splunk_otel.version import __version__ +logger = _get_logger(__name__) + thread_states = {} batch_processor = None @@ -247,6 +251,11 @@ def _force_flush(): def _start_profiling(options): + logger.debug( + "starting profiling call_stack_interval=%s endpoint=%s", + options.call_stack_interval, + options.endpoint, + ) wrapt.wrap_function_wrapper( "opentelemetry.context", "attach", _wrapped_context_attach ) diff --git a/splunk_otel/tracing.py b/splunk_otel/tracing.py index 392063e3..371e1631 100644 --- a/splunk_otel/tracing.py +++ b/splunk_otel/tracing.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import logging import os import sys from typing import Collection, Dict, Optional, Union @@ -27,12 +26,9 @@ from pkg_resources import iter_entry_points from splunk_otel.options import _Options, _SpanExporterFactory -from splunk_otel.util import _is_truthy +from splunk_otel.util import _get_logger, _is_truthy -otel_log_level = os.environ.get("OTEL_LOG_LEVEL", logging.INFO) - -logger = logging.getLogger(__file__) -logger.setLevel(otel_log_level) +logger = _get_logger(__name__) def start_tracing( diff --git a/splunk_otel/util.py b/splunk_otel/util.py index 4ebc6ec2..add919da 100644 --- a/splunk_otel/util.py +++ b/splunk_otel/util.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
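+
+# _get_log_level maps the OTEL_LOG_LEVEL values "none", "debug", "info",
+# "warn", "error" and "fatal" onto the stdlib logging levels; _get_logger
+# applies it (defaulting to "info") and attaches a StreamHandler.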
+import logging +import os from typing import Any @@ -19,3 +21,25 @@ def _is_truthy(value: Any) -> bool: if isinstance(value, str): value = value.lower().strip() return value in [True, 1, "true", "yes"] + + +def _get_log_level(level): + levels = { + "none": logging.NOTSET, + "debug": logging.DEBUG, + "info": logging.INFO, + "warn": logging.WARNING, + "error": logging.ERROR, + "fatal": logging.CRITICAL, + } + + return levels[level.lower()] + + +def _get_logger(name): + level = _get_log_level(os.environ.get("OTEL_LOG_LEVEL", "info")) + logger = logging.getLogger(name) + logger.setLevel(level) + logger.addHandler(logging.StreamHandler()) + + return logger From f52bb888b5fcf3c7bfc13a027d43808eeac19afb Mon Sep 17 00:00:00 2001 From: Siim Kallas Date: Mon, 30 Oct 2023 13:18:15 +0200 Subject: [PATCH 06/29] fix logging, add additional warnings --- splunk_otel/__init__.py | 3 +++ splunk_otel/distro.py | 5 ++-- splunk_otel/metrics.py | 5 ++-- splunk_otel/profiling/__init__.py | 45 +++++++++++++++++++++---------- splunk_otel/profiling/options.py | 29 +++++++++++++++++--- splunk_otel/tracing.py | 5 ++-- splunk_otel/util.py | 4 +-- 7 files changed, 70 insertions(+), 26 deletions(-) diff --git a/splunk_otel/__init__.py b/splunk_otel/__init__.py index a620135b..e4e23384 100644 --- a/splunk_otel/__init__.py +++ b/splunk_otel/__init__.py @@ -20,6 +20,9 @@ """ from .defaults import _set_otel_defaults +from .util import _init_logger + +_init_logger("splunk_otel") _set_otel_defaults() diff --git a/splunk_otel/distro.py b/splunk_otel/distro.py index 4bd24f99..bb4444e0 100644 --- a/splunk_otel/distro.py +++ b/splunk_otel/distro.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import logging import os from typing import Any, Dict @@ -23,9 +24,9 @@ from splunk_otel.profiling import _start_profiling from splunk_otel.profiling.options import _Options as ProfilingOptions from splunk_otel.tracing import _configure_tracing -from splunk_otel.util import _get_logger, _is_truthy +from splunk_otel.util import _is_truthy -logger = _get_logger(__name__) +logger = logging.getLogger(__name__) class _SplunkDistro(BaseDistro): diff --git a/splunk_otel/metrics.py b/splunk_otel/metrics.py index 8db04ce4..c1b3eeac 100644 --- a/splunk_otel/metrics.py +++ b/splunk_otel/metrics.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
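+# Per-module loggers revert to plain logging.getLogger(__name__); level and
+# handler setup now happens once, via _init_logger("splunk_otel") in the
+# package __init__, presumably so child loggers inherit one configuration
+# instead of each module attaching its own StreamHandler.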
+import logging import os from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter @@ -20,9 +21,9 @@ from opentelemetry.sdk.metrics import MeterProvider from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader -from splunk_otel.util import _get_logger, _is_truthy +from splunk_otel.util import _is_truthy -logger = _get_logger(__name__) +logger = logging.getLogger(__name__) def start_metrics() -> MeterProvider: diff --git a/splunk_otel/profiling/__init__.py b/splunk_otel/profiling/__init__.py index dec5e56d..ad426f1b 100644 --- a/splunk_otel/profiling/__init__.py +++ b/splunk_otel/profiling/__init__.py @@ -14,6 +14,7 @@ import base64 import gzip +import logging import os import sys import threading @@ -36,13 +37,19 @@ import splunk_otel from splunk_otel.profiling import profile_pb2 from splunk_otel.profiling.options import _Options -from splunk_otel.util import _get_logger from splunk_otel.version import __version__ -logger = _get_logger(__name__) +logger = logging.getLogger(__name__) -thread_states = {} -batch_processor = None + +class Profiler: + def __init__(self): + self.running = False + self.thread_states = {} + self.batch_processor = None + + +profiler = Profiler() class StringTable: @@ -128,7 +135,7 @@ def get_location(frame): labels = [timestamp_label, event_period_label, thread_id_label] - trace_context = thread_states.get(thread_id) + trace_context = profiler.thread_states.get(thread_id) if trace_context: (trace_id, span_id) = trace_context @@ -167,8 +174,7 @@ def _profiler_loop(options: _Options): exporter = OTLPLogExporter(options.endpoint) # pylint: disable-next=global-statement - global batch_processor - batch_processor = BatchLogRecordProcessor(exporter) + profiler.batch_processor = BatchLogRecordProcessor(exporter) while True: profiling_stacktraces = [] @@ -208,7 +214,7 @@ def _profiler_loop(options: _Options): ), instrumentation_scope=InstrumentationScope("otel.profiling", "0.1.0"), ) - batch_processor.emit(log_data) + profiler.batch_processor.emit(log_data) time.sleep(interval / 1e3) @@ -222,7 +228,10 @@ def _wrapped_context_attach(wrapped, _instance, args, kwargs): if span: thread_id = threading.get_ident() - thread_states[thread_id] = (span.context.trace_id, span.context.span_id) + profiler.thread_states[thread_id] = ( + span.context.trace_id, + span.context.span_id, + ) return token @@ -237,20 +246,28 @@ def _wrapped_context_detach(wrapped, _instance, args, kwargs): span = prev.get(_SPAN_KEY) if span: - thread_states[thread_id] = (span.context.trace_id, span.context.span_id) + profiler.thread_states[thread_id] = ( + span.context.trace_id, + span.context.span_id, + ) else: - thread_states[thread_id] = None + profiler.thread_states[thread_id] = None else: - thread_states[thread_id] = None + profiler.thread_states[thread_id] = None return wrapped(*args, **kwargs) def _force_flush(): - if batch_processor: - batch_processor.force_flush() + if profiler.batch_processor: + profiler.batch_processor.force_flush() def _start_profiling(options): + if profiler.running: + logger.warning("profiler already running") + return + + profiler.running = True logger.debug( "starting profiling call_stack_interval=%s endpoint=%s", options.call_stack_interval, diff --git a/splunk_otel/profiling/options.py b/splunk_otel/profiling/options.py index 3ec08ff4..4f3bd585 100644 --- a/splunk_otel/profiling/options.py +++ b/splunk_otel/profiling/options.py @@ -12,11 +12,34 @@ # See the License for the specific language governing permissions and # limitations 
under the License. +import logging from os import environ from typing import Optional from opentelemetry.sdk.resources import Resource +logger = logging.getLogger(__name__) +_DEFAULT_CALL_STACK_INTERVAL = 1_000 + + +def _sanitize_interval(interval): + if isinstance(interval, int): + if interval < 1: + logger.warning( + "call stack interval has to be positive, got %s, defaulting to %s", + interval, + _DEFAULT_CALL_STACK_INTERVAL, + ) + return _DEFAULT_CALL_STACK_INTERVAL + + return interval + + logger.warning( + "call stack interval not an integer, defaulting to %s", + _DEFAULT_CALL_STACK_INTERVAL, + ) + return _DEFAULT_CALL_STACK_INTERVAL + class _Options: resource: Resource @@ -49,8 +72,8 @@ def _get_call_stack_interval(interval: Optional[int]) -> int: interval = environ.get("SPLUNK_PROFILER_CALL_STACK_INTERVAL") if interval: - return int(interval) + return _sanitize_interval(int(interval)) - return 1_000 + return _DEFAULT_CALL_STACK_INTERVAL - return interval + return _sanitize_interval(interval) diff --git a/splunk_otel/tracing.py b/splunk_otel/tracing.py index 371e1631..b27bee76 100644 --- a/splunk_otel/tracing.py +++ b/splunk_otel/tracing.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import logging import os import sys from typing import Collection, Dict, Optional, Union @@ -26,9 +27,9 @@ from pkg_resources import iter_entry_points from splunk_otel.options import _Options, _SpanExporterFactory -from splunk_otel.util import _get_logger, _is_truthy +from splunk_otel.util import _is_truthy -logger = _get_logger(__name__) +logger = logging.getLogger(__name__) def start_tracing( diff --git a/splunk_otel/util.py b/splunk_otel/util.py index add919da..b6f3c2bf 100644 --- a/splunk_otel/util.py +++ b/splunk_otel/util.py @@ -36,10 +36,8 @@ def _get_log_level(level): return levels[level.lower()] -def _get_logger(name): +def _init_logger(name): level = _get_log_level(os.environ.get("OTEL_LOG_LEVEL", "info")) logger = logging.getLogger(name) logger.setLevel(level) logger.addHandler(logging.StreamHandler()) - - return logger From 625e3f045a34489b379395347e227c8697a82ab5 Mon Sep 17 00:00:00 2001 From: Siim Kallas Date: Mon, 30 Oct 2023 20:39:52 +0200 Subject: [PATCH 07/29] readd nonzero location and function ids, add stack frame count --- splunk_otel/profiling/__init__.py | 5 +++-- tests/unit/test_profiling.py | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/splunk_otel/profiling/__init__.py b/splunk_otel/profiling/__init__.py index ad426f1b..a4771230 100644 --- a/splunk_otel/profiling/__init__.py +++ b/splunk_otel/profiling/__init__.py @@ -82,7 +82,7 @@ def get_function(file_name, function_name): if fun is None: name_id = str_table.index(function_name) fun = profile_pb2.Function() - fun.id = len(functions_table) + fun.id = len(functions_table) + 1 fun.name = name_id fun.system_name = name_id fun.filename = str_table.index(file_name) @@ -103,7 +103,7 @@ def get_location(frame): if location is None: location = profile_pb2.Location() - location.id = len(locations_table) + location.id = len(locations_table) + 1 location.line.append(get_line(file_name, function_name, line_no)) locations_table[key] = location @@ -210,6 +210,7 @@ def _profiler_loop(options: _Options): "profiling.data.format": "pprof-gzip-base64", "profiling.data.type": "cpu", "com.splunk.sourcetype": "otel.profiling", + "profiling.data.total.frame.count": len(profiling_stacktraces) }, ), 
instrumentation_scope=InstrumentationScope("otel.profiling", "0.1.0"), diff --git a/tests/unit/test_profiling.py b/tests/unit/test_profiling.py index 68b9700c..1711d56f 100644 --- a/tests/unit/test_profiling.py +++ b/tests/unit/test_profiling.py @@ -81,6 +81,7 @@ def _assert_log_record(self, log_record): "profiling.data.format": "pprof-gzip-base64", "profiling.data.type": "cpu", "com.splunk.sourcetype": "otel.profiling", + "profiling.data.total.frame.count": 5, }, ) @@ -113,8 +114,8 @@ def _assert_log_record(self, log_record): for sample in main_thread_samples: for location_id in sample.location_id: - location = locations[location_id] - function = functions[location.line[0].function_id] + location = locations[location_id - 1] + function = functions[location.line[0].function_id - 1] function_name = strings[function.name] file_name = strings[function.filename] self.assertGreater(len(file_name), 0) From 3f54b42e15aeb53eba0181fd09dac1f9e9243abd Mon Sep 17 00:00:00 2001 From: Siim Kallas Date: Mon, 30 Oct 2023 20:40:59 +0200 Subject: [PATCH 08/29] fix: format --- splunk_otel/profiling/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/splunk_otel/profiling/__init__.py b/splunk_otel/profiling/__init__.py index a4771230..03700690 100644 --- a/splunk_otel/profiling/__init__.py +++ b/splunk_otel/profiling/__init__.py @@ -173,7 +173,6 @@ def _profiler_loop(options: _Options): interval = options.call_stack_interval exporter = OTLPLogExporter(options.endpoint) - # pylint: disable-next=global-statement profiler.batch_processor = BatchLogRecordProcessor(exporter) while True: @@ -210,7 +209,7 @@ def _profiler_loop(options: _Options): "profiling.data.format": "pprof-gzip-base64", "profiling.data.type": "cpu", "com.splunk.sourcetype": "otel.profiling", - "profiling.data.total.frame.count": len(profiling_stacktraces) + "profiling.data.total.frame.count": len(profiling_stacktraces), }, ), instrumentation_scope=InstrumentationScope("otel.profiling", "0.1.0"), From d259b7fadb649dc8a7926b4cbb4ec2f8d95cc969 Mon Sep 17 00:00:00 2001 From: Siim Kallas Date: Tue, 31 Oct 2023 10:59:03 +0200 Subject: [PATCH 09/29] remove unused imports --- splunk_otel/profiling/__init__.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/splunk_otel/profiling/__init__.py b/splunk_otel/profiling/__init__.py index 03700690..aa81edd2 100644 --- a/splunk_otel/profiling/__init__.py +++ b/splunk_otel/profiling/__init__.py @@ -15,7 +15,6 @@ import base64 import gzip import logging -import os import sys import threading import time @@ -30,14 +29,12 @@ from opentelemetry.sdk._logs import LogData, LogRecord from opentelemetry.sdk._logs.export import BatchLogRecordProcessor from opentelemetry.sdk.util.instrumentation import InstrumentationScope -from opentelemetry.semconv.resource import ResourceAttributes from opentelemetry.trace import TraceFlags from opentelemetry.trace.propagation import _SPAN_KEY import splunk_otel from splunk_otel.profiling import profile_pb2 from splunk_otel.profiling.options import _Options -from splunk_otel.version import __version__ logger = logging.getLogger(__name__) From b5f998587225bacf6d01816e65c185a521a29d82 Mon Sep 17 00:00:00 2001 From: Siim Kallas Date: Thu, 2 Nov 2023 13:46:31 +0200 Subject: [PATCH 10/29] add stop_profiling, filtering of internal threads --- .pylintrc | 2 +- splunk_otel/profiling/__init__.py | 186 ++++++++++++++++++++---------- splunk_otel/profiling/options.py | 14 +++ tests/unit/test_profiling.py | 98 ++++++++++++---- 4 files changed, 219 insertions(+), 81 
deletions(-) diff --git a/.pylintrc b/.pylintrc index 7214b01b..b2abd0f9 100644 --- a/.pylintrc +++ b/.pylintrc @@ -464,7 +464,7 @@ max-bool-expr=5 max-branches=12 # Maximum number of locals for function / method body. -max-locals=30 +max-locals=32 # Maximum number of parents for a class (see R0901). max-parents=7 diff --git a/splunk_otel/profiling/__init__.py b/splunk_otel/profiling/__init__.py index aa81edd2..f3702422 100644 --- a/splunk_otel/profiling/__init__.py +++ b/splunk_otel/profiling/__init__.py @@ -22,10 +22,12 @@ from collections import OrderedDict from typing import Dict, Optional, Union +import opentelemetry.context import wrapt from opentelemetry._logs import SeverityNumber -from opentelemetry.context import Context +from opentelemetry.context import Context, attach, detach from opentelemetry.exporter.otlp.proto.grpc._log_exporter import OTLPLogExporter +from opentelemetry.instrumentation.utils import unwrap from opentelemetry.sdk._logs import LogData, LogRecord from opentelemetry.sdk._logs.export import BatchLogRecordProcessor from opentelemetry.sdk.util.instrumentation import InstrumentationScope @@ -41,12 +43,13 @@ class Profiler: def __init__(self): + self.condition = threading.Condition(threading.Lock()) self.running = False self.thread_states = {} self.batch_processor = None -profiler = Profiler() +_profiler = Profiler() class StringTable: @@ -67,7 +70,7 @@ def keys(self): return list(self.strings.keys()) -def _encode_cpu_profile(stacktraces, interval): +def _encode_cpu_profile(stacktraces, timestamp_unix_nanos, interval): str_table = StringTable() locations_table = OrderedDict() functions_table = OrderedDict() @@ -106,6 +109,8 @@ def get_location(frame): return location + timestamp_unix_millis = int(timestamp_unix_nanos / 1e6) + timestamp_key = str_table.index("source.event.time") trace_id_key = str_table.index("trace_id") span_id_key = str_table.index("span_id") @@ -124,7 +129,7 @@ def get_location(frame): timestamp_label = profile_pb2.Label() timestamp_label.key = timestamp_key - timestamp_label.num = int(stacktrace["timestamp"] / 1e6) + timestamp_label.num = timestamp_unix_millis thread_id_label = profile_pb2.Label() thread_id_label.key = thread_id_key @@ -132,7 +137,7 @@ def get_location(frame): labels = [timestamp_label, event_period_label, thread_id_label] - trace_context = profiler.thread_states.get(thread_id) + trace_context = _profiler.thread_states.get(thread_id) if trace_context: (trace_id, span_id) = trace_context @@ -166,53 +171,105 @@ def get_location(frame): return gzip.compress(pb_profile.SerializeToString()) +def _extract_log_processor_thread_id(processor): + if hasattr(processor, "_worker_thread"): + # pylint: disable-next=protected-access + worker_thread = processor._worker_thread + + if isinstance(worker_thread, threading.Thread): + return worker_thread.ident + + return None + + +def _get_ignored_thread_ids(profiler, include_internal_stacks=False): + if include_internal_stacks: + return [] + + ignored_ids = [threading.get_ident()] + + log_processor_thread_id = _extract_log_processor_thread_id(profiler.batch_processor) + + if log_processor_thread_id: + ignored_ids.append(log_processor_thread_id) + + return ignored_ids + + +def _collect_stacktraces(ignored_thread_ids): + stacktraces = [] + + frames = sys._current_frames() + + for thread_id, frame in frames.items(): + if thread_id in ignored_thread_ids: + continue + + stacktrace_frames = [] + # TODO: This is potentially really slow due to code line lookups in the file + stack = 
traceback.extract_stack(frame) + for sf in stack: + stacktrace_frames.append((sf.filename, sf.name, sf.lineno)) + stacktrace = { + "stacktrace": stacktrace_frames, + "tid": thread_id, + } + stacktraces.append(stacktrace) + return stacktraces + + +def _to_log_data(stacktraces, timestamp_unix_nanos, call_stack_interval_millis, resource): + encoded_profile = base64.b64encode( + _encode_cpu_profile(stacktraces, timestamp_unix_nanos, call_stack_interval_millis) + ).decode() + + return LogData( + log_record=LogRecord( + timestamp=timestamp_unix_nanos, + trace_id=0, + span_id=0, + trace_flags=TraceFlags(0x01), + severity_number=SeverityNumber.UNSPECIFIED, + body=encoded_profile, + resource=resource, + attributes={ + "profiling.data.format": "pprof-gzip-base64", + "profiling.data.type": "cpu", + "com.splunk.sourcetype": "otel.profiling", + "profiling.data.total.frame.count": len(stacktraces), + }, + ), + instrumentation_scope=InstrumentationScope("otel.profiling", "0.1.0"), + ) + + def _profiler_loop(options: _Options): - interval = options.call_stack_interval + call_stack_interval_millis = options.call_stack_interval exporter = OTLPLogExporter(options.endpoint) - profiler.batch_processor = BatchLogRecordProcessor(exporter) + _profiler.batch_processor = BatchLogRecordProcessor(exporter) + + ignored_thread_ids = _get_ignored_thread_ids( + _profiler, include_internal_stacks=options.include_internal_stacks + ) - while True: - profiling_stacktraces = [] - frames = sys._current_frames() + while _profiler.running: timestamp = int(time.time() * 1e9) - for thread_id, frame in frames.items(): - prof_stacktrace_frames = [] - # TODO: This is potentially really slow due to code line lookups in the file - stack = traceback.extract_stack(frame) - for sf in stack: - prof_stacktrace_frames.append((sf.filename, sf.name, sf.lineno)) - prof_stacktrace = { - "timestamp": timestamp, - "stacktrace": prof_stacktrace_frames, - "tid": thread_id, - } - profiling_stacktraces.append(prof_stacktrace) - - encoded_profile = base64.b64encode( - _encode_cpu_profile(profiling_stacktraces, interval) - ).decode() - log_data = LogData( - log_record=LogRecord( - timestamp=timestamp, - trace_id=0, - span_id=0, - trace_flags=TraceFlags(0x01), - severity_number=SeverityNumber.UNSPECIFIED, - body=encoded_profile, - resource=options.resource, - attributes={ - "profiling.data.format": "pprof-gzip-base64", - "profiling.data.type": "cpu", - "com.splunk.sourcetype": "otel.profiling", - "profiling.data.total.frame.count": len(profiling_stacktraces), - }, - ), - instrumentation_scope=InstrumentationScope("otel.profiling", "0.1.0"), + stacktraces = _collect_stacktraces(ignored_thread_ids) + + log_data = _to_log_data( + stacktraces, timestamp, call_stack_interval_millis, options.resource ) - profiler.batch_processor.emit(log_data) - time.sleep(interval / 1e3) + + _profiler.batch_processor.emit(log_data) + + with _profiler.condition: + _profiler.condition.wait(call_stack_interval_millis / 1e3) + + _profiler.batch_processor.shutdown() + with _profiler.condition: + _profiler.condition.notify_all() def _wrapped_context_attach(wrapped, _instance, args, kwargs): @@ -225,7 +282,7 @@ def _wrapped_context_attach(wrapped, _instance, args, kwargs): if span: thread_id = threading.get_ident() - profiler.thread_states[thread_id] = ( + _profiler.thread_states[thread_id] = ( span.context.trace_id, span.context.span_id, ) @@ -243,39 +300,35 @@ def _wrapped_context_detach(wrapped, _instance, args, kwargs): span = prev.get(_SPAN_KEY) if span: - 
profiler.thread_states[thread_id] = ( + _profiler.thread_states[thread_id] = ( span.context.trace_id, span.context.span_id, ) else: - profiler.thread_states[thread_id] = None + _profiler.thread_states[thread_id] = None else: - profiler.thread_states[thread_id] = None + _profiler.thread_states[thread_id] = None return wrapped(*args, **kwargs) def _force_flush(): - if profiler.batch_processor: - profiler.batch_processor.force_flush() + if _profiler.batch_processor: + _profiler.batch_processor.force_flush() def _start_profiling(options): - if profiler.running: + if _profiler.running: logger.warning("profiler already running") return - profiler.running = True + _profiler.running = True logger.debug( "starting profiling call_stack_interval=%s endpoint=%s", options.call_stack_interval, options.endpoint, ) - wrapt.wrap_function_wrapper( - "opentelemetry.context", "attach", _wrapped_context_attach - ) - wrapt.wrap_function_wrapper( - "opentelemetry.context", "detach", _wrapped_context_detach - ) + wrapt.wrap_function_wrapper(opentelemetry.context, "attach", _wrapped_context_attach) + wrapt.wrap_function_wrapper(opentelemetry.context, "detach", _wrapped_context_detach) profiler_thread = threading.Thread( name="splunk-otel-profiler", target=_profiler_loop, args=[options], daemon=True @@ -295,3 +348,18 @@ def start_profiling( ) options = _Options(resource, endpoint, call_stack_interval) _start_profiling(options) + + +def stop_profiling(): + if not _profiler.running: + return + + _profiler.running = False + with _profiler.condition: + # Wake up the profiler thread + _profiler.condition.notify_all() + # Wait for the profiler thread to exit + _profiler.condition.wait() + + unwrap(opentelemetry.context, "attach") + unwrap(opentelemetry.context, "detach") diff --git a/splunk_otel/profiling/options.py b/splunk_otel/profiling/options.py index 4f3bd585..0ef32510 100644 --- a/splunk_otel/profiling/options.py +++ b/splunk_otel/profiling/options.py @@ -18,8 +18,11 @@ from opentelemetry.sdk.resources import Resource +from splunk_otel.util import _is_truthy + logger = logging.getLogger(__name__) _DEFAULT_CALL_STACK_INTERVAL = 1_000 +_DEFAULT_INCLUDE_INTERNAL_STACKS = False def _sanitize_interval(interval): @@ -51,10 +54,14 @@ def __init__( resource: Resource, endpoint: Optional[str] = None, call_stack_interval: Optional[int] = None, + include_internal_stacks: Optional[bool] = None, ): self.resource = resource self.endpoint = _Options._get_endpoint(endpoint) self.call_stack_interval = _Options._get_call_stack_interval(call_stack_interval) + self.include_internal_stacks = _Options._include_internal_stacks( + include_internal_stacks + ) @staticmethod def _get_endpoint(endpoint: Optional[str]) -> str: @@ -77,3 +84,10 @@ def _get_call_stack_interval(interval: Optional[int]) -> int: return _DEFAULT_CALL_STACK_INTERVAL return _sanitize_interval(interval) + + @staticmethod + def _include_internal_stacks(include: Optional[bool]) -> bool: + if include is None: + return _is_truthy(environ.get("SPLUNK_PROFILER_INCLUDE_INTERNAL_STACKS")) + + return include diff --git a/tests/unit/test_profiling.py b/tests/unit/test_profiling.py index 1711d56f..afd65128 100644 --- a/tests/unit/test_profiling.py +++ b/tests/unit/test_profiling.py @@ -14,11 +14,12 @@ import base64 import gzip +import os import random import threading import time import unittest -from unittest.mock import patch +from unittest import mock from opentelemetry import trace as trace_api from opentelemetry.exporter.otlp.proto.grpc._log_exporter import OTLPLogExporter 
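The include_internal_stacks option above resolves its environment variable with the same _is_truthy helper used for OTEL_TRACE_ENABLED and SPLUNK_PROFILER_ENABLED. A small self-contained sketch of the behaviour (helper body copied from splunk_otel/util.py, example values invented):

import os

def _is_truthy(value):
    if isinstance(value, str):
        value = value.lower().strip()
    return value in [True, 1, "true", "yes"]

os.environ["SPLUNK_PROFILER_INCLUDE_INTERNAL_STACKS"] = " YES "
assert _is_truthy(os.environ.get("SPLUNK_PROFILER_INCLUDE_INTERNAL_STACKS"))
assert not _is_truthy("false")
assert not _is_truthy(None)  # variable unset -> internal stacks stay filtered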
@@ -26,7 +27,12 @@ from opentelemetry.sdk.trace import TracerProvider from opentelemetry.sdk.trace.export import ConsoleSpanExporter, SimpleSpanProcessor -from splunk_otel.profiling import _force_flush, profile_pb2, start_profiling +from splunk_otel.profiling import ( + _force_flush, + profile_pb2, + start_profiling, + stop_profiling, +) def do_work(time_ms): @@ -54,16 +60,57 @@ def find_label(sample, name, strings): return None +def log_record_to_profile(log_record): + body = log_record.body + pprof_gzipped = base64.b64decode(body) + pprof = gzip.decompress(pprof_gzipped) + profile = profile_pb2.Profile() + profile.ParseFromString(pprof) + return profile + + class TestProfiling(unittest.TestCase): + @classmethod + def setUpClass(cls): + provider = TracerProvider() + processor = SimpleSpanProcessor(ConsoleSpanExporter()) + provider.add_span_processor(processor) + trace_api.set_tracer_provider(provider) + def setUp(self): self.span_id = None self.trace_id = None - self.export_patcher = patch.object(OTLPLogExporter, "export") + self.export_patcher = mock.patch.object(OTLPLogExporter, "export") self.export_mock = self.export_patcher.start() self.export_mock.return_value = LogExportResult.SUCCESS def tearDown(self): self.export_patcher.stop() + stop_profiling() + + def profile_capture_thread_ids(self): + start_profiling( + service_name="prof-thread-filter", + call_stack_interval=10, + ) + + do_work(100) + stop_profiling() + + args = self.export_mock.call_args + (log_datas,) = args[0] + self.assertGreaterEqual(len(log_datas), 5) + + thread_ids = set() + for log_data in log_datas: + profile = log_record_to_profile(log_data.log_record) + + strings = profile.string_table + + for sample in profile.sample: + thread_ids.add(find_label(sample, "thread.id", strings).num) + + return thread_ids def _assert_scope(self, scope): self.assertEqual(scope.name, "otel.profiling") @@ -75,15 +122,13 @@ def _assert_log_record(self, log_record): self.assertEqual(log_record.trace_id, 0) self.assertEqual(log_record.span_id, 0) # Attributes are of type BoundedAttributes, get the underlying dict - self.assertDictEqual( - log_record.attributes.copy(), - { - "profiling.data.format": "pprof-gzip-base64", - "profiling.data.type": "cpu", - "com.splunk.sourcetype": "otel.profiling", - "profiling.data.total.frame.count": 5, - }, - ) + attributes = log_record.attributes.copy() + + self.assertEqual(attributes["profiling.data.format"], "pprof-gzip-base64") + self.assertEqual(attributes["profiling.data.type"], "cpu") + self.assertEqual(attributes["com.splunk.sourcetype"], "otel.profiling") + # We should at least have the main thread + self.assertGreaterEqual(attributes["profiling.data.total.frame.count"], 1) resource = log_record.resource.attributes @@ -91,11 +136,7 @@ def _assert_log_record(self, log_record): self.assertEqual(resource["telemetry.sdk.language"], "python") self.assertEqual(resource["service.name"], "prof-export-test") - body = log_record.body - pprof_gzipped = base64.b64decode(body) - pprof = gzip.decompress(pprof_gzipped) - profile = profile_pb2.Profile() - profile.ParseFromString(pprof) + profile = log_record_to_profile(log_record) strings = profile.string_table locations = profile.location @@ -132,10 +173,6 @@ def _assert_log_record(self, log_record): return False def test_profiling_export(self): - provider = TracerProvider() - processor = SimpleSpanProcessor(ConsoleSpanExporter()) - provider.add_span_processor(processor) - trace_api.set_tracer_provider(provider) tracer = trace_api.get_tracer("tests.tracer") 
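        # (Editorial aside, not part of the patch: _assert_log_record checks
        # both halves of the pipeline -- the OTLP log envelope (attributes,
        # resource, zeroed trace/span ids on the record itself) and the decoded
        # pprof payload, whose per-sample labels must carry the trace context
        # captured while the span below was active.)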
start_profiling( @@ -168,3 +205,22 @@ def test_profiling_export(self): busy_function_profiled = True self.assertTrue(busy_function_profiled) + + def test_include_internal_threads(self): + with_internal_stacks_thread_ids = set() + with mock.patch.dict( + os.environ, {"SPLUNK_PROFILER_INCLUDE_INTERNAL_STACKS": "true"}, clear=True + ): + with_internal_stacks_thread_ids = self.profile_capture_thread_ids() + + without_internal_stacks_thread_ids = set() + with mock.patch.dict( + os.environ, {"SPLUNK_PROFILER_INCLUDE_INTERNAL_STACKS": "false"}, clear=True + ): + without_internal_stacks_thread_ids = self.profile_capture_thread_ids() + + # Internally the profiler should use 2 threads: the sampler thread and export thread. + count_diff = len(with_internal_stacks_thread_ids) - len( + without_internal_stacks_thread_ids + ) + self.assertEqual(count_diff, 2) From 5cabb8fa344fb25b2888f8cc41ee64263165e596 Mon Sep 17 00:00:00 2001 From: Siim Kallas Date: Tue, 7 Nov 2023 19:07:58 +0200 Subject: [PATCH 11/29] feat: handle process forking, use logger provider --- splunk_otel/distro.py | 1 + splunk_otel/profiling/__init__.py | 93 ++++++++++++++++++------------- 2 files changed, 54 insertions(+), 40 deletions(-) diff --git a/splunk_otel/distro.py b/splunk_otel/distro.py index bb4444e0..e58d5631 100644 --- a/splunk_otel/distro.py +++ b/splunk_otel/distro.py @@ -33,6 +33,7 @@ class _SplunkDistro(BaseDistro): def __init__(self): tracing_enabled = os.environ.get("OTEL_TRACE_ENABLED", True) profiling_enabled = os.environ.get("SPLUNK_PROFILER_ENABLED", False) + print("Profiling", profiling_enabled) self._tracing_enabled = _is_truthy(tracing_enabled) self._profiling_enabled = _is_truthy(profiling_enabled) if not self._tracing_enabled: diff --git a/splunk_otel/profiling/__init__.py b/splunk_otel/profiling/__init__.py index f3702422..d2d0b527 100644 --- a/splunk_otel/profiling/__init__.py +++ b/splunk_otel/profiling/__init__.py @@ -15,6 +15,7 @@ import base64 import gzip import logging +import os import sys import threading import time @@ -28,7 +29,7 @@ from opentelemetry.context import Context, attach, detach from opentelemetry.exporter.otlp.proto.grpc._log_exporter import OTLPLogExporter from opentelemetry.instrumentation.utils import unwrap -from opentelemetry.sdk._logs import LogData, LogRecord +from opentelemetry.sdk._logs import LogData, LoggerProvider, LogRecord from opentelemetry.sdk._logs.export import BatchLogRecordProcessor from opentelemetry.sdk.util.instrumentation import InstrumentationScope from opentelemetry.trace import TraceFlags @@ -46,7 +47,9 @@ def __init__(self): self.condition = threading.Condition(threading.Lock()) self.running = False self.thread_states = {} + self.logger_provider = None self.batch_processor = None + self.options = None _profiler = Profiler() @@ -182,13 +185,13 @@ def _extract_log_processor_thread_id(processor): return None -def _get_ignored_thread_ids(profiler, include_internal_stacks=False): +def _get_ignored_thread_ids(batch_processor, include_internal_stacks=False): if include_internal_stacks: return [] ignored_ids = [threading.get_ident()] - log_processor_thread_id = _extract_log_processor_thread_id(profiler.batch_processor) + log_processor_thread_id = _extract_log_processor_thread_id(batch_processor) if log_processor_thread_id: ignored_ids.append(log_processor_thread_id) @@ -218,58 +221,57 @@ def _collect_stacktraces(ignored_thread_ids): return stacktraces -def _to_log_data(stacktraces, timestamp_unix_nanos, call_stack_interval_millis, resource): +def _to_log_record( + 
stacktraces, timestamp_unix_nanos, call_stack_interval_millis, resource +): encoded_profile = base64.b64encode( _encode_cpu_profile(stacktraces, timestamp_unix_nanos, call_stack_interval_millis) ).decode() - return LogData( - log_record=LogRecord( - timestamp=timestamp_unix_nanos, - trace_id=0, - span_id=0, - trace_flags=TraceFlags(0x01), - severity_number=SeverityNumber.UNSPECIFIED, - body=encoded_profile, - resource=resource, - attributes={ - "profiling.data.format": "pprof-gzip-base64", - "profiling.data.type": "cpu", - "com.splunk.sourcetype": "otel.profiling", - "profiling.data.total.frame.count": len(stacktraces), - }, - ), - instrumentation_scope=InstrumentationScope("otel.profiling", "0.1.0"), + return LogRecord( + timestamp=timestamp_unix_nanos, + trace_id=0, + span_id=0, + trace_flags=TraceFlags(0x01), + severity_number=SeverityNumber.UNSPECIFIED, + body=encoded_profile, + resource=resource, + attributes={ + "profiling.data.format": "pprof-gzip-base64", + "profiling.data.type": "cpu", + "com.splunk.sourcetype": "otel.profiling", + "profiling.data.total.frame.count": len(stacktraces), + }, ) -def _profiler_loop(options: _Options): +def _profiler_loop(profiler: Profiler): + options = profiler.options call_stack_interval_millis = options.call_stack_interval - exporter = OTLPLogExporter(options.endpoint) - _profiler.batch_processor = BatchLogRecordProcessor(exporter) - ignored_thread_ids = _get_ignored_thread_ids( - _profiler, include_internal_stacks=options.include_internal_stacks + profiler.batch_processor, include_internal_stacks=options.include_internal_stacks ) - while _profiler.running: + profiling_logger = profiler.logger_provider.get_logger("otel.profiling", "0.1.0") + + while profiler.running: timestamp = int(time.time() * 1e9) stacktraces = _collect_stacktraces(ignored_thread_ids) - log_data = _to_log_data( + log_record = _to_log_record( stacktraces, timestamp, call_stack_interval_millis, options.resource ) - _profiler.batch_processor.emit(log_data) + profiling_logger.emit(log_record) - with _profiler.condition: - _profiler.condition.wait(call_stack_interval_millis / 1e3) + with profiler.condition: + profiler.condition.wait(call_stack_interval_millis / 1e3) - _profiler.batch_processor.shutdown() - with _profiler.condition: - _profiler.condition.notify_all() + profiler.logger_provider.shutdown() + with profiler.condition: + profiler.condition.notify_all() def _wrapped_context_attach(wrapped, _instance, args, kwargs): @@ -311,9 +313,16 @@ def _wrapped_context_detach(wrapped, _instance, args, kwargs): return wrapped(*args, **kwargs) +def _start_profiler_thread(): + profiler_thread = threading.Thread( + name="splunk-otel-profiler", target=_profiler_loop, args=[_profiler], daemon=True + ) + profiler_thread.start() + + def _force_flush(): - if _profiler.batch_processor: - _profiler.batch_processor.force_flush() + if _profiler.logger_provider: + _profiler.logger_provider.force_flush() def _start_profiling(options): @@ -321,7 +330,12 @@ def _start_profiling(options): logger.warning("profiler already running") return + _profiler.options = options + _profiler.logger_provider = LoggerProvider(resource=options.resource) + _profiler.batch_processor = BatchLogRecordProcessor(OTLPLogExporter(options.endpoint)) + _profiler.logger_provider.add_log_record_processor(_profiler.batch_processor) _profiler.running = True + logger.debug( "starting profiling call_stack_interval=%s endpoint=%s", options.call_stack_interval, @@ -330,10 +344,9 @@ def _start_profiling(options): 
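    # (Editorial aside, not part of the patch: the wrapt wrappers installed
    # below hook opentelemetry.context.attach/detach so that every context
    # switch records or clears the active span's (trace_id, span_id) under the
    # current thread id in thread_states; _encode_cpu_profile later joins
    # stacktraces to traces through that map, and stop_profiling restores the
    # original functions via unwrap.)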
wrapt.wrap_function_wrapper(opentelemetry.context, "attach", _wrapped_context_attach) wrapt.wrap_function_wrapper(opentelemetry.context, "detach", _wrapped_context_detach) - profiler_thread = threading.Thread( - name="splunk-otel-profiler", target=_profiler_loop, args=[options], daemon=True - ) - profiler_thread.start() + os.register_at_fork(after_in_child=_start_profiler_thread) + + _start_profiler_thread() def start_profiling( From 8c278eafd91a41e407c28c674490ded2f393506e Mon Sep 17 00:00:00 2001 From: Siim Kallas Date: Tue, 7 Nov 2023 23:51:40 +0200 Subject: [PATCH 12/29] chore: get rid of unnecessary imports --- splunk_otel/profiling/__init__.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/splunk_otel/profiling/__init__.py b/splunk_otel/profiling/__init__.py index d2d0b527..a9675cd8 100644 --- a/splunk_otel/profiling/__init__.py +++ b/splunk_otel/profiling/__init__.py @@ -26,12 +26,11 @@ import opentelemetry.context import wrapt from opentelemetry._logs import SeverityNumber -from opentelemetry.context import Context, attach, detach +from opentelemetry.context import Context from opentelemetry.exporter.otlp.proto.grpc._log_exporter import OTLPLogExporter from opentelemetry.instrumentation.utils import unwrap -from opentelemetry.sdk._logs import LogData, LoggerProvider, LogRecord +from opentelemetry.sdk._logs import LoggerProvider, LogRecord from opentelemetry.sdk._logs.export import BatchLogRecordProcessor -from opentelemetry.sdk.util.instrumentation import InstrumentationScope from opentelemetry.trace import TraceFlags from opentelemetry.trace.propagation import _SPAN_KEY From 2a0dfe00000988942057295d963db3470f468c98 Mon Sep 17 00:00:00 2001 From: Siim Kallas Date: Wed, 8 Nov 2023 10:25:54 +0200 Subject: [PATCH 13/29] stacktrace building without line lookups --- splunk_otel/profiling/__init__.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/splunk_otel/profiling/__init__.py b/splunk_otel/profiling/__init__.py index a9675cd8..ce2143c4 100644 --- a/splunk_otel/profiling/__init__.py +++ b/splunk_otel/profiling/__init__.py @@ -21,6 +21,7 @@ import time import traceback from collections import OrderedDict +from traceback import StackSummary from typing import Dict, Optional, Union import opentelemetry.context @@ -198,6 +199,14 @@ def _get_ignored_thread_ids(batch_processor, include_internal_stacks=False): return ignored_ids +def extract_stack(frame): + stack = StackSummary.extract( + traceback.walk_stack(frame), limit=None, lookup_lines=False + ) + stack.reverse() + return stack + + def _collect_stacktraces(ignored_thread_ids): stacktraces = [] @@ -208,8 +217,7 @@ def _collect_stacktraces(ignored_thread_ids): continue stacktrace_frames = [] - # TODO: This is potentially really slow due to code line lookups in the file - stack = traceback.extract_stack(frame) + stack = extract_stack(frame) for sf in stack: stacktrace_frames.append((sf.filename, sf.name, sf.lineno)) stacktrace = { From e12c363fbb004635e1a4b7b3ed333211d19a3dab Mon Sep 17 00:00:00 2001 From: Siim Kallas Date: Wed, 8 Nov 2023 10:26:41 +0200 Subject: [PATCH 14/29] account for processing time when scheduling next sampler sleep --- splunk_otel/profiling/__init__.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/splunk_otel/profiling/__init__.py b/splunk_otel/profiling/__init__.py index ce2143c4..8a7a1b0f 100644 --- a/splunk_otel/profiling/__init__.py +++ b/splunk_otel/profiling/__init__.py @@ -262,8 +262,12 @@ def _profiler_loop(profiler: 
Profiler): profiling_logger = profiler.logger_provider.get_logger("otel.profiling", "0.1.0") + call_stack_interval_seconds = call_stack_interval_millis / 1e3 + # In case the processing takes more than the given interval, default to the smallest allowed interval + min_call_stack_interval_seconds = 1 / 1e3 while profiler.running: - timestamp = int(time.time() * 1e9) + time_begin = time.time() + timestamp = int(time_begin * 1e9) stacktraces = _collect_stacktraces(ignored_thread_ids) @@ -273,8 +277,13 @@ def _profiler_loop(profiler: Profiler): profiling_logger.emit(log_record) + processing_time = time.time() - time_begin + wait_for = max( + call_stack_interval_seconds - processing_time, min_call_stack_interval_seconds + ) + with profiler.condition: - profiler.condition.wait(call_stack_interval_millis / 1e3) + profiler.condition.wait(wait_for) profiler.logger_provider.shutdown() with profiler.condition: From 0823889ef1ef9aeb5e2bde02e4e40e8bd251bc0b Mon Sep 17 00:00:00 2001 From: Siim Kallas Date: Wed, 8 Nov 2023 10:34:28 +0200 Subject: [PATCH 15/29] remove debug printout --- splunk_otel/distro.py | 1 - 1 file changed, 1 deletion(-) diff --git a/splunk_otel/distro.py b/splunk_otel/distro.py index e58d5631..bb4444e0 100644 --- a/splunk_otel/distro.py +++ b/splunk_otel/distro.py @@ -33,7 +33,6 @@ class _SplunkDistro(BaseDistro): def __init__(self): tracing_enabled = os.environ.get("OTEL_TRACE_ENABLED", True) profiling_enabled = os.environ.get("SPLUNK_PROFILER_ENABLED", False) - print("Profiling", profiling_enabled) self._tracing_enabled = _is_truthy(tracing_enabled) self._profiling_enabled = _is_truthy(profiling_enabled) if not self._tracing_enabled: From 1c0ef191441efea05392c66f468a37fb386da9ef Mon Sep 17 00:00:00 2001 From: Siim Kallas Date: Wed, 8 Nov 2023 11:37:59 +0200 Subject: [PATCH 16/29] tear down profiler thread on exit --- splunk_otel/profiling/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/splunk_otel/profiling/__init__.py b/splunk_otel/profiling/__init__.py index 8a7a1b0f..625c57bb 100644 --- a/splunk_otel/profiling/__init__.py +++ b/splunk_otel/profiling/__init__.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
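The interval accounting introduced in PATCH 14 can be sanity-checked in isolation; a minimal sketch with invented example numbers (the real loop derives both values from the configured call stack interval and a fixed 1 ms floor):

call_stack_interval_seconds = 0.5
min_call_stack_interval_seconds = 1 / 1e3

def next_wait(processing_time_seconds):
    # Sleep only for the remainder of the interval, never below the floor.
    return max(
        call_stack_interval_seconds - processing_time_seconds,
        min_call_stack_interval_seconds,
    )

assert next_wait(0.125) == 0.375  # fast iteration: sleep only the remainder
assert next_wait(0.75) == 0.001   # overrun: clamp to the 1 ms floor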
+import atexit import base64 import gzip import logging @@ -361,6 +362,7 @@ def _start_profiling(options): wrapt.wrap_function_wrapper(opentelemetry.context, "detach", _wrapped_context_detach) os.register_at_fork(after_in_child=_start_profiler_thread) + atexit.register(stop_profiling) _start_profiler_thread() From 2f352baa3a66937065a38e500f8869cca080dacd Mon Sep 17 00:00:00 2001 From: Siim Kallas Date: Wed, 8 Nov 2023 13:16:57 +0200 Subject: [PATCH 17/29] hack around process hang on exit --- splunk_otel/profiling/__init__.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/splunk_otel/profiling/__init__.py b/splunk_otel/profiling/__init__.py index 625c57bb..8ad7ac87 100644 --- a/splunk_otel/profiling/__init__.py +++ b/splunk_otel/profiling/__init__.py @@ -48,6 +48,7 @@ def __init__(self): self.condition = threading.Condition(threading.Lock()) self.running = False self.thread_states = {} + self.exporter = None self.logger_provider = None self.batch_processor = None self.options = None @@ -349,7 +350,8 @@ def _start_profiling(options): _profiler.options = options _profiler.logger_provider = LoggerProvider(resource=options.resource) - _profiler.batch_processor = BatchLogRecordProcessor(OTLPLogExporter(options.endpoint)) + _profiler.exporter = OTLPLogExporter(options.endpoint) + _profiler.batch_processor = BatchLogRecordProcessor(_profiler.exporter) _profiler.logger_provider.add_log_record_processor(_profiler.batch_processor) _profiler.running = True @@ -386,6 +388,9 @@ def stop_profiling(): return _profiler.running = False + # Hack around batch log processor getting stuck on application exits when the profiling endpoint is not reachable. + # Could be related: https://github.com/open-telemetry/opentelemetry-python/issues/2284 + _profiler.exporter.shutdown(timeout_millis=0) with _profiler.condition: # Wake up the profiler thread _profiler.condition.notify_all() From b4f5db1dfc83f03e24a2db3574b43f43541400d5 Mon Sep 17 00:00:00 2001 From: Siim Kallas Date: Wed, 8 Nov 2023 17:13:39 +0200 Subject: [PATCH 18/29] hack around shutdown --- splunk_otel/profiling/__init__.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/splunk_otel/profiling/__init__.py b/splunk_otel/profiling/__init__.py index 8ad7ac87..5be32d8b 100644 --- a/splunk_otel/profiling/__init__.py +++ b/splunk_otel/profiling/__init__.py @@ -287,6 +287,10 @@ def _profiler_loop(profiler: Profiler): with profiler.condition: profiler.condition.wait(wait_for) + # Hack around batch log processor getting stuck on application exits when the profiling endpoint is not reachable + # Could be related: https://github.com/open-telemetry/opentelemetry-python/issues/2284 + # pylint: disable-next=protected-access + _profiler.exporter._shutdown = True profiler.logger_provider.shutdown() with profiler.condition: profiler.condition.notify_all() @@ -388,9 +392,6 @@ def stop_profiling(): return _profiler.running = False - # Hack around batch log processor getting stuck on application exits when the profiling endpoint is not reachable. 
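The shutdown hacks in these two patches sit on top of a Condition-based handshake between stop_profiling and the sampler loop: the same Condition serves as an interruptible sleep between samples and as the shutdown acknowledgement. A self-contained sketch of that handshake (timings invented; the real stop path waits without a timeout):

import threading
import time

condition = threading.Condition(threading.Lock())
running = True

def sampler_loop():
    while running:
        with condition:
            condition.wait(0.01)  # interruptible sleep between samples
    with condition:
        condition.notify_all()  # acknowledge shutdown to the stopper

thread = threading.Thread(target=sampler_loop, daemon=True)
thread.start()
time.sleep(0.05)

running = False
with condition:
    condition.notify_all()  # wake the sampler immediately
    condition.wait(1)  # bounded here; wait for the sampler's acknowledgement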
-    # Could be related: https://github.com/open-telemetry/opentelemetry-python/issues/2284
-    _profiler.exporter.shutdown(timeout_millis=0)
     with _profiler.condition:
         # Wake up the profiler thread
         _profiler.condition.notify_all()

From 7fcf3047557c63321762ad8ebb796139322ed9e6 Mon Sep 17 00:00:00 2001
From: Siim Kallas
Date: Wed, 8 Nov 2023 17:26:41 +0200
Subject: [PATCH 19/29] get rid of logger provider due to Windows hangs

---
 splunk_otel/profiling/__init__.py | 53 +++++++++++++++----------
 1 file changed, 25 insertions(+), 28 deletions(-)

diff --git a/splunk_otel/profiling/__init__.py b/splunk_otel/profiling/__init__.py
index 5be32d8b..32d99033 100644
--- a/splunk_otel/profiling/__init__.py
+++ b/splunk_otel/profiling/__init__.py
@@ -31,8 +31,9 @@
 from opentelemetry.context import Context
 from opentelemetry.exporter.otlp.proto.grpc._log_exporter import OTLPLogExporter
 from opentelemetry.instrumentation.utils import unwrap
-from opentelemetry.sdk._logs import LoggerProvider, LogRecord
+from opentelemetry.sdk._logs import LogData, LogRecord
 from opentelemetry.sdk._logs.export import BatchLogRecordProcessor
+from opentelemetry.sdk.util.instrumentation import InstrumentationScope
 from opentelemetry.trace import TraceFlags
 from opentelemetry.trace.propagation import _SPAN_KEY

@@ -49,7 +50,6 @@ def __init__(self):
         self.running = False
         self.thread_states = {}
         self.exporter = None
-        self.logger_provider = None
         self.batch_processor = None
         self.options = None

@@ -230,27 +230,28 @@ def _collect_stacktraces(ignored_thread_ids):
     return stacktraces

-def _to_log_record(
-    stacktraces, timestamp_unix_nanos, call_stack_interval_millis, resource
-):
+def _to_log_data(stacktraces, timestamp_unix_nanos, call_stack_interval_millis, resource):
     encoded_profile = base64.b64encode(
         _encode_cpu_profile(stacktraces, timestamp_unix_nanos,
processing_time = time.time() - time_begin wait_for = max( @@ -291,7 +290,7 @@ def _profiler_loop(profiler: Profiler): # Could be related: https://github.com/open-telemetry/opentelemetry-python/issues/2284 # pylint: disable-next=protected-access _profiler.exporter._shutdown = True - profiler.logger_provider.shutdown() + profiler.batch_processor.shutdown() with profiler.condition: profiler.condition.notify_all() @@ -343,8 +342,8 @@ def _start_profiler_thread(): def _force_flush(): - if _profiler.logger_provider: - _profiler.logger_provider.force_flush() + if _profiler.batch_processor: + _profiler.batch_processor.force_flush() def _start_profiling(options): @@ -353,10 +352,8 @@ def _start_profiling(options): return _profiler.options = options - _profiler.logger_provider = LoggerProvider(resource=options.resource) _profiler.exporter = OTLPLogExporter(options.endpoint) _profiler.batch_processor = BatchLogRecordProcessor(_profiler.exporter) - _profiler.logger_provider.add_log_record_processor(_profiler.batch_processor) _profiler.running = True logger.debug( From e0ed42a3b6378abb3842408ba418aac08466f9f0 Mon Sep 17 00:00:00 2001 From: Siim Kallas Date: Wed, 8 Nov 2023 18:22:29 +0200 Subject: [PATCH 20/29] add check for register_at_fork --- splunk_otel/profiling/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/splunk_otel/profiling/__init__.py b/splunk_otel/profiling/__init__.py index 32d99033..533a7657 100644 --- a/splunk_otel/profiling/__init__.py +++ b/splunk_otel/profiling/__init__.py @@ -364,7 +364,9 @@ def _start_profiling(options): wrapt.wrap_function_wrapper(opentelemetry.context, "attach", _wrapped_context_attach) wrapt.wrap_function_wrapper(opentelemetry.context, "detach", _wrapped_context_detach) - os.register_at_fork(after_in_child=_start_profiler_thread) + # Windows does not have register_at_fork + if hasattr(os, "register_at_fork"): + os.register_at_fork(after_in_child=_start_profiler_thread) atexit.register(stop_profiling) _start_profiler_thread() From 858d80e25f7ceeb4e11f128957ad3897c3735cde Mon Sep 17 00:00:00 2001 From: Siim Kallas Date: Wed, 8 Nov 2023 18:28:24 +0200 Subject: [PATCH 21/29] readd logger provider --- splunk_otel/profiling/__init__.py | 54 +++++++++++++++++-------------- 1 file changed, 29 insertions(+), 25 deletions(-) diff --git a/splunk_otel/profiling/__init__.py b/splunk_otel/profiling/__init__.py index 533a7657..2bcedbd3 100644 --- a/splunk_otel/profiling/__init__.py +++ b/splunk_otel/profiling/__init__.py @@ -31,7 +31,7 @@ from opentelemetry.context import Context from opentelemetry.exporter.otlp.proto.grpc._log_exporter import OTLPLogExporter from opentelemetry.instrumentation.utils import unwrap -from opentelemetry.sdk._logs import LogData, LogRecord +from opentelemetry.sdk._logs import LoggerProvider, LogRecord from opentelemetry.sdk._logs.export import BatchLogRecordProcessor from opentelemetry.sdk.util.instrumentation import InstrumentationScope from opentelemetry.trace import TraceFlags @@ -49,6 +49,7 @@ def __init__(self): self.condition = threading.Condition(threading.Lock()) self.running = False self.thread_states = {} + self.logger_provider = None self.exporter = None self.batch_processor = None self.options = None @@ -230,28 +231,27 @@ def _collect_stacktraces(ignored_thread_ids): return stacktraces -def _to_log_data(stacktraces, timestamp_unix_nanos, call_stack_interval_millis, resource): +def _to_log_record( + stacktraces, timestamp_unix_nanos, call_stack_interval_millis, resource +): encoded_profile = 
base64.b64encode( _encode_cpu_profile(stacktraces, timestamp_unix_nanos, call_stack_interval_millis) ).decode() - return LogData( - LogRecord( - timestamp=timestamp_unix_nanos, - trace_id=0, - span_id=0, - trace_flags=TraceFlags(0x01), - severity_number=SeverityNumber.UNSPECIFIED, - body=encoded_profile, - resource=resource, - attributes={ - "profiling.data.format": "pprof-gzip-base64", - "profiling.data.type": "cpu", - "com.splunk.sourcetype": "otel.profiling", - "profiling.data.total.frame.count": len(stacktraces), - }, - ), - instrumentation_scope=InstrumentationScope("otel.profiling", "0.1.0"), + return LogRecord( + timestamp=timestamp_unix_nanos, + trace_id=0, + span_id=0, + trace_flags=TraceFlags(0x01), + severity_number=SeverityNumber.UNSPECIFIED, + body=encoded_profile, + resource=resource, + attributes={ + "profiling.data.format": "pprof-gzip-base64", + "profiling.data.type": "cpu", + "com.splunk.sourcetype": "otel.profiling", + "profiling.data.total.frame.count": len(stacktraces), + }, ) @@ -263,6 +263,8 @@ def _profiler_loop(profiler: Profiler): profiler.batch_processor, include_internal_stacks=options.include_internal_stacks ) + profiling_logger = profiler.logger_provider.get_logger("otel.profiling", "0.1.0") + call_stack_interval_seconds = call_stack_interval_millis / 1e3 # In case the processing takes more than the given interval, default to the smallest allowed interval min_call_stack_interval_seconds = 1 / 1e3 @@ -272,11 +274,11 @@ def _profiler_loop(profiler: Profiler): stacktraces = _collect_stacktraces(ignored_thread_ids) - log_data = _to_log_data( + log_record = _to_log_record( stacktraces, timestamp, call_stack_interval_millis, options.resource ) - profiler.batch_processor.emit(log_data) + profiling_logger.emit(log_record) processing_time = time.time() - time_begin wait_for = max( @@ -289,8 +291,8 @@ def _profiler_loop(profiler: Profiler): # Hack around batch log processor getting stuck on application exits when the profiling endpoint is not reachable # Could be related: https://github.com/open-telemetry/opentelemetry-python/issues/2284 # pylint: disable-next=protected-access - _profiler.exporter._shutdown = True - profiler.batch_processor.shutdown() + profiler.exporter._shutdown = True + profiler.logger_provider.shutdown() with profiler.condition: profiler.condition.notify_all() @@ -342,8 +344,8 @@ def _start_profiler_thread(): def _force_flush(): - if _profiler.batch_processor: - _profiler.batch_processor.force_flush() + if _profiler.logger_provider: + _profiler.logger_provider.force_flush() def _start_profiling(options): @@ -352,8 +354,10 @@ def _start_profiling(options): return _profiler.options = options + _profiler.logger_provider = LoggerProvider(resource=options.resource) _profiler.exporter = OTLPLogExporter(options.endpoint) _profiler.batch_processor = BatchLogRecordProcessor(_profiler.exporter) + _profiler.logger_provider.add_log_record_processor(_profiler.batch_processor) _profiler.running = True logger.debug( From b437ec4b247f06226af18f8bc55a1f116ce81775 Mon Sep 17 00:00:00 2001 From: Siim Kallas Date: Thu, 9 Nov 2023 17:53:59 +0200 Subject: [PATCH 22/29] add comment about trace context to profiling tests --- tests/unit/test_profiling.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/unit/test_profiling.py b/tests/unit/test_profiling.py index afd65128..ff2abbff 100644 --- a/tests/unit/test_profiling.py +++ b/tests/unit/test_profiling.py @@ -72,6 +72,8 @@ def log_record_to_profile(log_record): class TestProfiling(unittest.TestCase): @classmethod 
def setUpClass(cls): + # Tracing is set up to correlate active spans with stacktraces. + # When the profiler takes a sample while a span is active, it stores the trace context into the stacktrace. provider = TracerProvider() processor = SimpleSpanProcessor(ConsoleSpanExporter()) provider.add_span_processor(processor) From 463699d35a489c8abce0c8caa10c48454bd4ebcf Mon Sep 17 00:00:00 2001 From: Siim Kallas Date: Thu, 9 Nov 2023 21:22:17 +0200 Subject: [PATCH 23/29] rename call_stack_interval to call_stack_interval_millis --- splunk_otel/profiling/__init__.py | 10 +++++----- splunk_otel/profiling/options.py | 21 +++++++++++---------- tests/unit/test_profiling.py | 4 ++-- 3 files changed, 18 insertions(+), 17 deletions(-) diff --git a/splunk_otel/profiling/__init__.py b/splunk_otel/profiling/__init__.py index 2bcedbd3..2d4f746b 100644 --- a/splunk_otel/profiling/__init__.py +++ b/splunk_otel/profiling/__init__.py @@ -257,7 +257,7 @@ def _to_log_record( def _profiler_loop(profiler: Profiler): options = profiler.options - call_stack_interval_millis = options.call_stack_interval + call_stack_interval_millis = options.call_stack_interval_millis ignored_thread_ids = _get_ignored_thread_ids( profiler.batch_processor, include_internal_stacks=options.include_internal_stacks @@ -361,8 +361,8 @@ def _start_profiling(options): _profiler.running = True logger.debug( - "starting profiling call_stack_interval=%s endpoint=%s", - options.call_stack_interval, + "starting profiling call_stack_interval_millis=%s endpoint=%s", + options.call_stack_interval_millis, options.endpoint, ) wrapt.wrap_function_wrapper(opentelemetry.context, "attach", _wrapped_context_attach) @@ -380,13 +380,13 @@ def start_profiling( service_name: Optional[str] = None, resource_attributes: Optional[Dict[str, Union[str, bool, int, float]]] = None, endpoint: Optional[str] = None, - call_stack_interval: Optional[int] = None, + call_stack_interval_millis: Optional[int] = None, ): # pylint: disable-next=protected-access resource = splunk_otel.options._Options._get_resource( service_name, resource_attributes ) - options = _Options(resource, endpoint, call_stack_interval) + options = _Options(resource, endpoint, call_stack_interval_millis) _start_profiling(options) diff --git a/splunk_otel/profiling/options.py b/splunk_otel/profiling/options.py index 0ef32510..e70669dc 100644 --- a/splunk_otel/profiling/options.py +++ b/splunk_otel/profiling/options.py @@ -21,8 +21,7 @@ from splunk_otel.util import _is_truthy logger = logging.getLogger(__name__) -_DEFAULT_CALL_STACK_INTERVAL = 1_000 -_DEFAULT_INCLUDE_INTERNAL_STACKS = False +_DEFAULT_CALL_STACK_INTERVAL_MILLIS = 1_000 def _sanitize_interval(interval): @@ -31,34 +30,36 @@ def _sanitize_interval(interval): logger.warning( "call stack interval has to be positive, got %s, defaulting to %s", interval, - _DEFAULT_CALL_STACK_INTERVAL, + _DEFAULT_CALL_STACK_INTERVAL_MILLIS, ) - return _DEFAULT_CALL_STACK_INTERVAL + return _DEFAULT_CALL_STACK_INTERVAL_MILLIS return interval logger.warning( "call stack interval not an integer, defaulting to %s", - _DEFAULT_CALL_STACK_INTERVAL, + _DEFAULT_CALL_STACK_INTERVAL_MILLIS, ) - return _DEFAULT_CALL_STACK_INTERVAL + return _DEFAULT_CALL_STACK_INTERVAL_MILLIS class _Options: resource: Resource endpoint: str - call_stack_interval: int + call_stack_interval_millis: int def __init__( self, resource: Resource, endpoint: Optional[str] = None, - call_stack_interval: Optional[int] = None, + call_stack_interval_millis: Optional[int] = None, include_internal_stacks: 
Optional[bool] = None, ): self.resource = resource self.endpoint = _Options._get_endpoint(endpoint) - self.call_stack_interval = _Options._get_call_stack_interval(call_stack_interval) + self.call_stack_interval_millis = _Options._get_call_stack_interval( + call_stack_interval_millis + ) self.include_internal_stacks = _Options._include_internal_stacks( include_internal_stacks ) @@ -81,7 +82,7 @@ def _get_call_stack_interval(interval: Optional[int]) -> int: if interval: return _sanitize_interval(int(interval)) - return _DEFAULT_CALL_STACK_INTERVAL + return _DEFAULT_CALL_STACK_INTERVAL_MILLIS return _sanitize_interval(interval) diff --git a/tests/unit/test_profiling.py b/tests/unit/test_profiling.py index ff2abbff..f6c69594 100644 --- a/tests/unit/test_profiling.py +++ b/tests/unit/test_profiling.py @@ -93,7 +93,7 @@ def tearDown(self): def profile_capture_thread_ids(self): start_profiling( service_name="prof-thread-filter", - call_stack_interval=10, + call_stack_interval_millis=10, ) do_work(100) @@ -180,7 +180,7 @@ def test_profiling_export(self): start_profiling( service_name="prof-export-test", resource_attributes={"foo": "bar"}, - call_stack_interval=100, + call_stack_interval_millis=100, ) with tracer.start_as_current_span("add-some-numbers") as span: From 68bf085b609c17d2760eaf82ff4623bdc8f7ca52 Mon Sep 17 00:00:00 2001 From: Siim Kallas Date: Thu, 9 Nov 2023 21:52:34 +0200 Subject: [PATCH 24/29] remove unused import --- splunk_otel/profiling/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/splunk_otel/profiling/__init__.py b/splunk_otel/profiling/__init__.py index 2d4f746b..42ba73c3 100644 --- a/splunk_otel/profiling/__init__.py +++ b/splunk_otel/profiling/__init__.py @@ -33,7 +33,6 @@ from opentelemetry.instrumentation.utils import unwrap from opentelemetry.sdk._logs import LoggerProvider, LogRecord from opentelemetry.sdk._logs.export import BatchLogRecordProcessor -from opentelemetry.sdk.util.instrumentation import InstrumentationScope from opentelemetry.trace import TraceFlags from opentelemetry.trace.propagation import _SPAN_KEY From 6956fc6457d281451c261fdac4bd406f8531d5cb Mon Sep 17 00:00:00 2001 From: Siim Kallas Date: Mon, 13 Nov 2023 13:00:07 +0200 Subject: [PATCH 25/29] pass explicit thread_states to to_log_record / encode_cpu_profile --- splunk_otel/profiling/__init__.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/splunk_otel/profiling/__init__.py b/splunk_otel/profiling/__init__.py index 42ba73c3..92686d74 100644 --- a/splunk_otel/profiling/__init__.py +++ b/splunk_otel/profiling/__init__.py @@ -75,7 +75,7 @@ def keys(self): return list(self.strings.keys()) -def _encode_cpu_profile(stacktraces, timestamp_unix_nanos, interval): +def _encode_cpu_profile(thread_states, stacktraces, timestamp_unix_nanos, interval): str_table = StringTable() locations_table = OrderedDict() functions_table = OrderedDict() @@ -142,7 +142,7 @@ def get_location(frame): labels = [timestamp_label, event_period_label, thread_id_label] - trace_context = _profiler.thread_states.get(thread_id) + trace_context = thread_states.get(thread_id) if trace_context: (trace_id, span_id) = trace_context @@ -231,10 +231,12 @@ def _collect_stacktraces(ignored_thread_ids): def _to_log_record( - stacktraces, timestamp_unix_nanos, call_stack_interval_millis, resource + thread_states, stacktraces, timestamp_unix_nanos, call_stack_interval_millis, resource ): encoded_profile = base64.b64encode( - _encode_cpu_profile(stacktraces, timestamp_unix_nanos, 
call_stack_interval_millis) + _encode_cpu_profile( + thread_states, stacktraces, timestamp_unix_nanos, call_stack_interval_millis + ) ).decode() return LogRecord( @@ -274,7 +276,11 @@ def _profiler_loop(profiler: Profiler): stacktraces = _collect_stacktraces(ignored_thread_ids) log_record = _to_log_record( - stacktraces, timestamp, call_stack_interval_millis, options.resource + profiler.thread_states, + stacktraces, + timestamp, + call_stack_interval_millis, + options.resource, ) profiling_logger.emit(log_record) From 78c78d574ee7d70d9387e77d4f062761f21ddc41 Mon Sep 17 00:00:00 2001 From: Siim Kallas Date: Mon, 13 Nov 2023 13:14:30 +0200 Subject: [PATCH 26/29] remove access to _profiler from _start_profiler_thread --- splunk_otel/profiling/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/splunk_otel/profiling/__init__.py b/splunk_otel/profiling/__init__.py index 92686d74..fba786d8 100644 --- a/splunk_otel/profiling/__init__.py +++ b/splunk_otel/profiling/__init__.py @@ -341,9 +341,9 @@ def _wrapped_context_detach(wrapped, _instance, args, kwargs): return wrapped(*args, **kwargs) -def _start_profiler_thread(): +def _start_profiler_thread(profiler): profiler_thread = threading.Thread( - name="splunk-otel-profiler", target=_profiler_loop, args=[_profiler], daemon=True + name="splunk-otel-profiler", target=_profiler_loop, args=[profiler], daemon=True ) profiler_thread.start() @@ -375,10 +375,10 @@ def _start_profiling(options): # Windows does not have register_at_fork if hasattr(os, "register_at_fork"): - os.register_at_fork(after_in_child=_start_profiler_thread) + os.register_at_fork(after_in_child=lambda: _start_profiler_thread(_profiler)) atexit.register(stop_profiling) - _start_profiler_thread() + _start_profiler_thread(_profiler) def start_profiling( From 2e388e70f3c77164f883839d3a5abea287fecaf2 Mon Sep 17 00:00:00 2001 From: Siim Kallas Date: Mon, 13 Nov 2023 14:14:38 +0200 Subject: [PATCH 27/29] wrap context hooks --- splunk_otel/profiling/__init__.py | 70 ++++++++++++++++++------------- 1 file changed, 40 insertions(+), 30 deletions(-) diff --git a/splunk_otel/profiling/__init__.py b/splunk_otel/profiling/__init__.py index fba786d8..473f44a9 100644 --- a/splunk_otel/profiling/__init__.py +++ b/splunk_otel/profiling/__init__.py @@ -302,43 +302,49 @@ def _profiler_loop(profiler: Profiler): profiler.condition.notify_all() -def _wrapped_context_attach(wrapped, _instance, args, kwargs): - token = wrapped(*args, **kwargs) +def _make_wrapped_context_attach(profiler): + def _wrapped_context_attach(wrapped, _instance, args, kwargs): + token = wrapped(*args, **kwargs) - maybe_context = args[0] if args else None + maybe_context = args[0] if args else None - if maybe_context: - span = maybe_context.get(_SPAN_KEY) + if maybe_context: + span = maybe_context.get(_SPAN_KEY) - if span: - thread_id = threading.get_ident() - _profiler.thread_states[thread_id] = ( - span.context.trace_id, - span.context.span_id, - ) + if span: + thread_id = threading.get_ident() + profiler.thread_states[thread_id] = ( + span.context.trace_id, + span.context.span_id, + ) - return token + return token + return _wrapped_context_attach -def _wrapped_context_detach(wrapped, _instance, args, kwargs): - token = args[0] if args else None - if token: - prev = token.old_value - thread_id = threading.get_ident() - if isinstance(prev, Context): - span = prev.get(_SPAN_KEY) +def _make_wrapped_context_detach(profiler): + def _wrapped_context_detach(wrapped, _instance, args, kwargs): + token = 
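
The context wrapper here is now produced by a factory closing over the
profiler instance instead of touching a global. The same shape, reduced to
a self-contained sketch with hypothetical names (the wrapt API is the one
this module already uses):

    import wrapt

    def make_wrapper(state):
        # `state` is captured by the closure; the wrapper needs no globals.
        def wrapper(wrapped, _instance, args, kwargs):
            state["calls"] = state.get("calls", 0) + 1
            return wrapped(*args, **kwargs)
        return wrapper

    class Api:
        def ping(self):
            return "pong"

    state = {}
    wrapt.wrap_function_wrapper(Api, "ping", make_wrapper(state))
    assert Api().ping() == "pong"
    assert state["calls"] == 1
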
args[0] if args else None - if span: - _profiler.thread_states[thread_id] = ( - span.context.trace_id, - span.context.span_id, - ) + if token: + prev = token.old_value + thread_id = threading.get_ident() + if isinstance(prev, Context): + span = prev.get(_SPAN_KEY) + + if span: + profiler.thread_states[thread_id] = ( + span.context.trace_id, + span.context.span_id, + ) + else: + profiler.thread_states[thread_id] = None else: - _profiler.thread_states[thread_id] = None - else: - _profiler.thread_states[thread_id] = None - return wrapped(*args, **kwargs) + profiler.thread_states[thread_id] = None + return wrapped(*args, **kwargs) + + return _wrapped_context_detach def _start_profiler_thread(profiler): @@ -370,8 +376,12 @@ def _start_profiling(options): options.call_stack_interval_millis, options.endpoint, ) - wrapt.wrap_function_wrapper(opentelemetry.context, "attach", _wrapped_context_attach) - wrapt.wrap_function_wrapper(opentelemetry.context, "detach", _wrapped_context_detach) + wrapt.wrap_function_wrapper( + opentelemetry.context, "attach", _make_wrapped_context_attach(_profiler) + ) + wrapt.wrap_function_wrapper( + opentelemetry.context, "detach", _make_wrapped_context_detach(_profiler) + ) # Windows does not have register_at_fork if hasattr(os, "register_at_fork"): From cf0adf28ebea4f0a8c1554fa1ddb00c6e9905b4a Mon Sep 17 00:00:00 2001 From: Pablo Collins Date: Tue, 14 Nov 2023 11:07:43 -0500 Subject: [PATCH 28/29] refactor Profiler into a class with methods (#355) --- splunk_otel/profiling/__init__.py | 495 +++++++++++++++--------------- 1 file changed, 250 insertions(+), 245 deletions(-) diff --git a/splunk_otel/profiling/__init__.py b/splunk_otel/profiling/__init__.py index 473f44a9..944e2bf0 100644 --- a/splunk_otel/profiling/__init__.py +++ b/splunk_otel/profiling/__init__.py @@ -44,161 +44,285 @@ class Profiler: + def __init__(self): self.condition = threading.Condition(threading.Lock()) - self.running = False self.thread_states = {} + + self.resource = None + self.call_stack_interval_millis = None + self.include_internal_stacks = None + self.running = False + self.logger_provider = None self.exporter = None self.batch_processor = None - self.options = None + def setup(self, resource, endpoint, call_stack_interval_millis, include_internal_stacks): + self.resource = resource + self.call_stack_interval_millis = call_stack_interval_millis + self.include_internal_stacks = include_internal_stacks + self.running = True -_profiler = Profiler() + self.exporter = OTLPLogExporter(endpoint) + self.batch_processor = BatchLogRecordProcessor(self.exporter) + self.logger_provider = LoggerProvider(resource=resource) + self.logger_provider.add_log_record_processor(self.batch_processor) + def loop(self): + ignored_thread_ids = self.get_ignored_thread_ids() -class StringTable: - def __init__(self): - self.strings = OrderedDict() + profiling_logger = self.logger_provider.get_logger("otel.profiling", "0.1.0") - def index(self, token): - idx = self.strings.get(token) + call_stack_interval_seconds = self.call_stack_interval_millis / 1e3 + # In case the processing takes more than the given interval, default to the smallest allowed interval + min_call_stack_interval_seconds = 1 / 1e3 + while self.is_running(): + time_begin = time.time() + timestamp = int(time_begin * 1e9) - if idx: - return idx + stacktraces = _collect_stacktraces(ignored_thread_ids) - idx = len(self.strings) - self.strings[token] = idx - return idx + log_record = self.to_log_record( + stacktraces, + timestamp, + ) - def keys(self): - 
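
The pacing in Profiler.loop above is worth spelling out: the time spent
collecting and exporting is subtracted from the nominal period so samples
stay roughly call_stack_interval_millis apart, with a 1 ms floor so that a
slow iteration can never turn the loop into a busy spin. The same
arithmetic as a standalone sketch:

    import time

    def next_wait_seconds(call_stack_interval_millis, time_begin):
        interval_s = call_stack_interval_millis / 1e3
        min_s = 1 / 1e3  # smallest allowed wait: 1 ms
        processing_s = time.time() - time_begin
        return max(interval_s - processing_s, min_s)

    t0 = time.time()
    assert 0.001 <= next_wait_seconds(100, t0) <= 0.1
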
return list(self.strings.keys()) + profiling_logger.emit(log_record) + processing_time = time.time() - time_begin + wait_for = max( + call_stack_interval_seconds - processing_time, min_call_stack_interval_seconds + ) -def _encode_cpu_profile(thread_states, stacktraces, timestamp_unix_nanos, interval): - str_table = StringTable() - locations_table = OrderedDict() - functions_table = OrderedDict() + with self.condition: + self.condition.wait(wait_for) - def get_function(file_name, function_name): - key = f"{file_name}:{function_name}" - fun = functions_table.get(key) + # Hack around batch log processor getting stuck on application exits when the profiling endpoint is not reachable + # Could be related: https://github.com/open-telemetry/opentelemetry-python/issues/2284 + # pylint: disable-next=protected-access + self.exporter._shutdown = True + self.logger_provider.shutdown() + with self.condition: + self.condition.notify_all() + + def get_ignored_thread_ids(self): + if self.include_internal_stacks: + return [] + + ignored_ids = [threading.get_ident()] + + log_processor_thread_id = self.extract_log_processor_thread_id() + + if log_processor_thread_id: + ignored_ids.append(log_processor_thread_id) + + return ignored_ids + + def extract_log_processor_thread_id(self): + if hasattr(self.batch_processor, "_worker_thread"): + # pylint: disable-next=protected-access + worker_thread = self.batch_processor._worker_thread + + if isinstance(worker_thread, threading.Thread): + return worker_thread.ident + + return None + + def to_log_record(self, stacktraces, timestamp_unix_nanos): + encoded = self.get_cpu_profile(stacktraces, timestamp_unix_nanos) + encoded_profile = base64.b64encode(encoded).decode() + + return LogRecord( + timestamp=timestamp_unix_nanos, + trace_id=0, + span_id=0, + trace_flags=TraceFlags(0x01), + severity_number=SeverityNumber.UNSPECIFIED, + body=encoded_profile, + resource=self.resource, + attributes={ + "profiling.data.format": "pprof-gzip-base64", + "profiling.data.type": "cpu", + "com.splunk.sourcetype": "otel.profiling", + "profiling.data.total.frame.count": len(stacktraces), + }, + ) - if fun is None: - name_id = str_table.index(function_name) - fun = profile_pb2.Function() - fun.id = len(functions_table) + 1 - fun.name = name_id - fun.system_name = name_id - fun.filename = str_table.index(file_name) - functions_table[key] = fun + def get_cpu_profile(self, stacktraces, timestamp_unix_nanos): + str_table = StringTable() + locations_table = OrderedDict() + functions_table = OrderedDict() - return fun + def get_function(file_name, function_name): + key = f"{file_name}:{function_name}" + fun = functions_table.get(key) - def get_line(file_name, function_name, line_no): - line = profile_pb2.Line() - line.function_id = get_function(file_name, function_name).id - line.line = line_no if line_no != 0 else -1 - return line + if fun is None: + name_id = str_table.index(function_name) + fun = profile_pb2.Function() + fun.id = len(functions_table) + 1 + fun.name = name_id + fun.system_name = name_id + fun.filename = str_table.index(file_name) + functions_table[key] = fun - def get_location(frame): - (file_name, function_name, line_no) = frame - key = f"{file_name}:{function_name}:{line_no}" - location = locations_table.get(key) + return fun - if location is None: - location = profile_pb2.Location() - location.id = len(locations_table) + 1 - location.line.append(get_line(file_name, function_name, line_no)) - locations_table[key] = location + def get_line(file_name, function_name, line_no): + 
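
get_function above is an interning table: pprof wants each distinct
function emitted once and referenced by id everywhere else, so lookups are
memoized on a "file:function" key. The same shape with a plain dict in
place of the protobuf message (illustrative only):

    functions = {}

    def intern_function(file_name, function_name):
        key = f"{file_name}:{function_name}"
        if key not in functions:
            # ids are 1-based, matching len(functions_table) + 1 above
            functions[key] = {"id": len(functions) + 1, "name": function_name}
        return functions[key]

    first = intern_function("app.py", "handler")
    again = intern_function("app.py", "handler")
    assert first["id"] == again["id"] == 1
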
line = profile_pb2.Line() + line.function_id = get_function(file_name, function_name).id + line.line = line_no if line_no != 0 else -1 + return line - return location + def get_location(frame): + (file_name, function_name, line_no) = frame + key = f"{file_name}:{function_name}:{line_no}" + location = locations_table.get(key) - timestamp_unix_millis = int(timestamp_unix_nanos / 1e6) + if location is None: + location = profile_pb2.Location() + location.id = len(locations_table) + 1 + location.line.append(get_line(file_name, function_name, line_no)) + locations_table[key] = location - timestamp_key = str_table.index("source.event.time") - trace_id_key = str_table.index("trace_id") - span_id_key = str_table.index("span_id") - thread_id_key = str_table.index("thread.id") - event_period_key = str_table.index("source.event.period") + return location - pb_profile = profile_pb2.Profile() + timestamp_unix_millis = int(timestamp_unix_nanos / 1e6) - event_period_label = profile_pb2.Label() - event_period_label.key = event_period_key - event_period_label.num = interval + timestamp_key = str_table.index("source.event.time") + trace_id_key = str_table.index("trace_id") + span_id_key = str_table.index("span_id") + thread_id_key = str_table.index("thread.id") + event_period_key = str_table.index("source.event.period") - samples = [] - for stacktrace in stacktraces: - thread_id = stacktrace["tid"] + pb_profile = profile_pb2.Profile() - timestamp_label = profile_pb2.Label() - timestamp_label.key = timestamp_key - timestamp_label.num = timestamp_unix_millis + event_period_label = profile_pb2.Label() + event_period_label.key = event_period_key + event_period_label.num = self.call_stack_interval_millis - thread_id_label = profile_pb2.Label() - thread_id_label.key = thread_id_key - thread_id_label.num = thread_id + samples = [] + for stacktrace in stacktraces: + thread_id = stacktrace["tid"] - labels = [timestamp_label, event_period_label, thread_id_label] + timestamp_label = profile_pb2.Label() + timestamp_label.key = timestamp_key + timestamp_label.num = timestamp_unix_millis - trace_context = thread_states.get(thread_id) - if trace_context: - (trace_id, span_id) = trace_context + thread_id_label = profile_pb2.Label() + thread_id_label.key = thread_id_key + thread_id_label.num = thread_id - trace_id_label = profile_pb2.Label() - trace_id_label.key = trace_id_key - trace_id_label.str = str_table.index(f"{trace_id:#016x}") - labels.append(trace_id_label) + labels = [timestamp_label, event_period_label, thread_id_label] - span_id_label = profile_pb2.Label() - span_id_label.key = span_id_key - span_id_label.str = str_table.index(f"{span_id:#08x}") - labels.append(span_id_label) + trace_context = self.thread_states.get(thread_id) + if trace_context: + (trace_id, span_id) = trace_context - sample = profile_pb2.Sample() + trace_id_label = profile_pb2.Label() + trace_id_label.key = trace_id_key + trace_id_label.str = str_table.index(f"{trace_id:#016x}") + labels.append(trace_id_label) - location_ids = [] + span_id_label = profile_pb2.Label() + span_id_label.key = span_id_key + span_id_label.str = str_table.index(f"{span_id:#08x}") + labels.append(span_id_label) - for frame in reversed(stacktrace["stacktrace"]): - location_ids.append(get_location(frame).id) + sample = profile_pb2.Sample() - sample.location_id.extend(location_ids) - sample.label.extend(labels) + location_ids = [] - samples.append(sample) + for frame in reversed(stacktrace["stacktrace"]): + location_ids.append(get_location(frame).id) - 
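
One detail of the sample assembly above: pprof's Sample.location_id is
leaf-first ("the leaf is at location_id[0]" per profile.proto), which is
presumably why the collected frames are reversed; the stacks gathered here
appear to be outermost-first. A toy illustration of the convention:

    collected = [("main.py", "main", 3), ("main.py", "work", 9)]  # root first (assumed)
    leaf_first = list(reversed(collected))
    assert leaf_first[0] == ("main.py", "work", 9)  # executing frame leads
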
pb_profile.sample.extend(samples) - pb_profile.string_table.extend(str_table.keys()) - pb_profile.function.extend(list(functions_table.values())) - pb_profile.location.extend(list(locations_table.values())) + sample.location_id.extend(location_ids) + sample.label.extend(labels) - return gzip.compress(pb_profile.SerializeToString()) + samples.append(sample) + pb_profile.sample.extend(samples) + pb_profile.string_table.extend(str_table.keys()) + pb_profile.function.extend(list(functions_table.values())) + pb_profile.location.extend(list(locations_table.values())) -def _extract_log_processor_thread_id(processor): - if hasattr(processor, "_worker_thread"): - # pylint: disable-next=protected-access - worker_thread = processor._worker_thread + return gzip.compress(pb_profile.SerializeToString()) + + def is_running(self): + return self.running + + def force_flush(self): + if self.logger_provider: + self.logger_provider.force_flush() + + def make_wrapped_context_attach(self): + def _wrapped_context_attach(wrapped, _instance, args, kwargs): + token = wrapped(*args, **kwargs) - if isinstance(worker_thread, threading.Thread): - return worker_thread.ident + maybe_context = args[0] if args else None - return None + if maybe_context: + span = maybe_context.get(_SPAN_KEY) + + if span: + thread_id = threading.get_ident() + self.thread_states[thread_id] = ( + span.context.trace_id, + span.context.span_id, + ) + + return token + + return _wrapped_context_attach + + def make_wrapped_context_detach(self): + def _wrapped_context_detach(wrapped, _instance, args, kwargs): + token = args[0] if args else None + + if token: + prev = token.old_value + thread_id = threading.get_ident() + if isinstance(prev, Context): + span = prev.get(_SPAN_KEY) + + if span: + self.thread_states[thread_id] = ( + span.context.trace_id, + span.context.span_id, + ) + else: + self.thread_states[thread_id] = None + else: + self.thread_states[thread_id] = None + return wrapped(*args, **kwargs) + + return _wrapped_context_detach -def _get_ignored_thread_ids(batch_processor, include_internal_stacks=False): - if include_internal_stacks: - return [] +_profiler = Profiler() + + +def get_profiler(): + return _profiler + + +class StringTable: + def __init__(self): + self.strings = OrderedDict() - ignored_ids = [threading.get_ident()] + def index(self, token): + idx = self.strings.get(token) - log_processor_thread_id = _extract_log_processor_thread_id(batch_processor) + if idx: + return idx - if log_processor_thread_id: - ignored_ids.append(log_processor_thread_id) + idx = len(self.strings) + self.strings[token] = idx + return idx - return ignored_ids + def keys(self): + return list(self.strings.keys()) def extract_stack(frame): @@ -230,165 +354,45 @@ def _collect_stacktraces(ignored_thread_ids): return stacktraces -def _to_log_record( - thread_states, stacktraces, timestamp_unix_nanos, call_stack_interval_millis, resource -): - encoded_profile = base64.b64encode( - _encode_cpu_profile( - thread_states, stacktraces, timestamp_unix_nanos, call_stack_interval_millis - ) - ).decode() - - return LogRecord( - timestamp=timestamp_unix_nanos, - trace_id=0, - span_id=0, - trace_flags=TraceFlags(0x01), - severity_number=SeverityNumber.UNSPECIFIED, - body=encoded_profile, - resource=resource, - attributes={ - "profiling.data.format": "pprof-gzip-base64", - "profiling.data.type": "cpu", - "com.splunk.sourcetype": "otel.profiling", - "profiling.data.total.frame.count": len(stacktraces), - }, - ) - - -def _profiler_loop(profiler: Profiler): - options = 
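
A small review note on the relocated StringTable above: the check
"if idx:" treats a stored index of 0 as a miss, so re-interning the very
first string would hand it a fresh, dangling id. It does not bite today,
since each profile builds a fresh table and the index-0 string
("source.event.time") is interned exactly once, but "is not None" is the
safer test. A corrected sketch:

    from collections import OrderedDict

    class StringTable:
        def __init__(self):
            self.strings = OrderedDict()

        def index(self, token):
            idx = self.strings.get(token)
            if idx is not None:  # 0 is a valid index
                return idx
            idx = len(self.strings)
            self.strings[token] = idx
            return idx

    table = StringTable()
    assert table.index("a") == 0
    assert table.index("a") == 0  # stable on re-intern
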
profiler.options - call_stack_interval_millis = options.call_stack_interval_millis - - ignored_thread_ids = _get_ignored_thread_ids( - profiler.batch_processor, include_internal_stacks=options.include_internal_stacks - ) - - profiling_logger = profiler.logger_provider.get_logger("otel.profiling", "0.1.0") - - call_stack_interval_seconds = call_stack_interval_millis / 1e3 - # In case the processing takes more than the given interval, default to the smallest allowed interval - min_call_stack_interval_seconds = 1 / 1e3 - while profiler.running: - time_begin = time.time() - timestamp = int(time_begin * 1e9) - - stacktraces = _collect_stacktraces(ignored_thread_ids) - - log_record = _to_log_record( - profiler.thread_states, - stacktraces, - timestamp, - call_stack_interval_millis, - options.resource, - ) - - profiling_logger.emit(log_record) - - processing_time = time.time() - time_begin - wait_for = max( - call_stack_interval_seconds - processing_time, min_call_stack_interval_seconds - ) - - with profiler.condition: - profiler.condition.wait(wait_for) - - # Hack around batch log processor getting stuck on application exits when the profiling endpoint is not reachable - # Could be related: https://github.com/open-telemetry/opentelemetry-python/issues/2284 - # pylint: disable-next=protected-access - profiler.exporter._shutdown = True - profiler.logger_provider.shutdown() - with profiler.condition: - profiler.condition.notify_all() - - -def _make_wrapped_context_attach(profiler): - def _wrapped_context_attach(wrapped, _instance, args, kwargs): - token = wrapped(*args, **kwargs) - - maybe_context = args[0] if args else None - - if maybe_context: - span = maybe_context.get(_SPAN_KEY) - - if span: - thread_id = threading.get_ident() - profiler.thread_states[thread_id] = ( - span.context.trace_id, - span.context.span_id, - ) - - return token - - return _wrapped_context_attach - - -def _make_wrapped_context_detach(profiler): - def _wrapped_context_detach(wrapped, _instance, args, kwargs): - token = args[0] if args else None - - if token: - prev = token.old_value - thread_id = threading.get_ident() - if isinstance(prev, Context): - span = prev.get(_SPAN_KEY) - - if span: - profiler.thread_states[thread_id] = ( - span.context.trace_id, - span.context.span_id, - ) - else: - profiler.thread_states[thread_id] = None - else: - profiler.thread_states[thread_id] = None - return wrapped(*args, **kwargs) - - return _wrapped_context_detach - - def _start_profiler_thread(profiler): profiler_thread = threading.Thread( - name="splunk-otel-profiler", target=_profiler_loop, args=[profiler], daemon=True + name="splunk-otel-profiler", target=profiler.loop, daemon=True ) profiler_thread.start() def _force_flush(): - if _profiler.logger_provider: - _profiler.logger_provider.force_flush() + profiler = get_profiler() + profiler.force_flush() -def _start_profiling(options): - if _profiler.running: +def _start_profiling(opts): + profiler = get_profiler() + + if profiler.is_running(): logger.warning("profiler already running") return - _profiler.options = options - _profiler.logger_provider = LoggerProvider(resource=options.resource) - _profiler.exporter = OTLPLogExporter(options.endpoint) - _profiler.batch_processor = BatchLogRecordProcessor(_profiler.exporter) - _profiler.logger_provider.add_log_record_processor(_profiler.batch_processor) - _profiler.running = True + profiler.setup(opts.resource, opts.endpoint, opts.call_stack_interval_millis, opts.include_internal_stacks) logger.debug( "starting profiling 
call_stack_interval_millis=%s endpoint=%s", - options.call_stack_interval_millis, - options.endpoint, + opts.call_stack_interval_millis, + opts.endpoint, ) wrapt.wrap_function_wrapper( - opentelemetry.context, "attach", _make_wrapped_context_attach(_profiler) + opentelemetry.context, "attach", profiler.make_wrapped_context_attach() ) wrapt.wrap_function_wrapper( - opentelemetry.context, "detach", _make_wrapped_context_detach(_profiler) + opentelemetry.context, "detach", profiler.make_wrapped_context_detach() ) # Windows does not have register_at_fork if hasattr(os, "register_at_fork"): - os.register_at_fork(after_in_child=lambda: _start_profiler_thread(_profiler)) + os.register_at_fork(after_in_child=lambda: _start_profiler_thread(profiler)) atexit.register(stop_profiling) - _start_profiler_thread(_profiler) + _start_profiler_thread(profiler) def start_profiling( @@ -401,20 +405,21 @@ def start_profiling( resource = splunk_otel.options._Options._get_resource( service_name, resource_attributes ) - options = _Options(resource, endpoint, call_stack_interval_millis) - _start_profiling(options) + opts = _Options(resource, endpoint, call_stack_interval_millis) + _start_profiling(opts) def stop_profiling(): - if not _profiler.running: + profiler = get_profiler() + if not profiler.is_running(): return - _profiler.running = False - with _profiler.condition: + profiler.running = False + with profiler.condition: # Wake up the profiler thread - _profiler.condition.notify_all() + profiler.condition.notify_all() # Wait for the profiler thread to exit - _profiler.condition.wait() + profiler.condition.wait() unwrap(opentelemetry.context, "attach") unwrap(opentelemetry.context, "detach") From 4f1c0e03ab4ab8e174355e1c028486d6bd3f1cc2 Mon Sep 17 00:00:00 2001 From: Siim Kallas Date: Tue, 14 Nov 2023 19:55:48 +0200 Subject: [PATCH 29/29] lint --- splunk_otel/profiling/__init__.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/splunk_otel/profiling/__init__.py b/splunk_otel/profiling/__init__.py index 944e2bf0..37366be4 100644 --- a/splunk_otel/profiling/__init__.py +++ b/splunk_otel/profiling/__init__.py @@ -44,7 +44,6 @@ class Profiler: - def __init__(self): self.condition = threading.Condition(threading.Lock()) self.thread_states = {} @@ -58,7 +57,9 @@ def __init__(self): self.exporter = None self.batch_processor = None - def setup(self, resource, endpoint, call_stack_interval_millis, include_internal_stacks): + def setup( + self, resource, endpoint, call_stack_interval_millis, include_internal_stacks + ): self.resource = resource self.call_stack_interval_millis = call_stack_interval_millis self.include_internal_stacks = include_internal_stacks @@ -92,7 +93,8 @@ def loop(self): processing_time = time.time() - time_begin wait_for = max( - call_stack_interval_seconds - processing_time, min_call_stack_interval_seconds + call_stack_interval_seconds - processing_time, + min_call_stack_interval_seconds, ) with self.condition: @@ -373,7 +375,12 @@ def _start_profiling(opts): logger.warning("profiler already running") return - profiler.setup(opts.resource, opts.endpoint, opts.call_stack_interval_millis, opts.include_internal_stacks) + profiler.setup( + opts.resource, + opts.endpoint, + opts.call_stack_interval_millis, + opts.include_internal_stacks, + ) logger.debug( "starting profiling call_stack_interval_millis=%s endpoint=%s",
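
Taken together, the series leaves a small public surface. A hypothetical
end-to-end use of it; the service name and workload are illustrative, and
an OTLP endpoint is assumed reachable at the exporter's default:

    import time

    from opentelemetry import trace
    from splunk_otel.profiling import start_profiling, stop_profiling

    start_profiling(
        service_name="demo-service",
        call_stack_interval_millis=100,
    )

    tracer = trace.get_tracer("demo")
    with tracer.start_as_current_span("busy-work"):
        time.sleep(1)  # stacks sampled here carry this span's trace context

    stop_profiling()  # optional; atexit runs it on interpreter shutdown too
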