From 8b38bfd29df41bdcde607a85f3722566ca49e374 Mon Sep 17 00:00:00 2001
From: Genevieve Warren <24882762+gewarren@users.noreply.github.com>
Date: Mon, 8 Apr 2024 20:26:54 -0700
Subject: [PATCH] add libraries updates

---
 docs/core/whats-new/dotnet-8/overview.md      |  2 +-
 docs/core/whats-new/dotnet-9/libraries.md     | 37 +++++++++-
 docs/core/whats-new/dotnet-9/overview.md      | 44 ++++++++++++
 docs/core/whats-new/dotnet-9/runtime.md       |  2 +-
 .../dotnet-9/csharp/LlamaTokenizer.cs         | 71 +++++++++++++++++++
 .../snippets/dotnet-9/csharp/Program.cs       |  1 +
 .../snippets/dotnet-9/csharp/Project.csproj   |  5 ++
 .../snippets/dotnet-9/csharp/Reflection.cs    | 43 ++++++++---
 .../snippets/dotnet-9/csharp/Tiktoken.cs      | 65 +++++++++++++++++
 .../snippets/dotnet-9/csharp/TimeSpan.cs      | 17 +++++
 10 files changed, 272 insertions(+), 15 deletions(-)
 create mode 100644 docs/core/whats-new/snippets/dotnet-9/csharp/LlamaTokenizer.cs
 create mode 100644 docs/core/whats-new/snippets/dotnet-9/csharp/Tiktoken.cs
 create mode 100644 docs/core/whats-new/snippets/dotnet-9/csharp/TimeSpan.cs

diff --git a/docs/core/whats-new/dotnet-8/overview.md b/docs/core/whats-new/dotnet-8/overview.md
index 362d1cff332a7..64e575e69bf67 100644
--- a/docs/core/whats-new/dotnet-8/overview.md
+++ b/docs/core/whats-new/dotnet-8/overview.md
@@ -15,7 +15,7 @@ The .NET 8 runtime includes improvements to performance, garbage collection, and

 ## .NET SDK

-For information about what's new in the .NET SDK, Native AOT, code analysis, and diagnostics, see [What's new in the SDK and tooling for .NET 8](sdk.md).
+For information about what's new in the .NET SDK, code analysis, and diagnostics, see [What's new in the SDK and tooling for .NET 8](sdk.md).

 ## C# 12

diff --git a/docs/core/whats-new/dotnet-9/libraries.md b/docs/core/whats-new/dotnet-9/libraries.md
index e3430966e2686..88f8d5bc981f5 100644
--- a/docs/core/whats-new/dotnet-9/libraries.md
+++ b/docs/core/whats-new/dotnet-9/libraries.md
@@ -84,8 +84,41 @@ KMAC is available on Linux with OpenSSL 3.0 or later, and on Windows 11 Build 26

 ## Reflection

-In .NET Core versions and .NET 5-8, support for building an assembly and emitting reflection metadata for dynamically created types was limited to a runnable <xref:System.Reflection.Emit.AssemblyBuilder>. The lack of support for *saving* an assembly was often a blocker for customers migrating from .NET Framework to .NET. .NET 9 adds public APIs to <xref:System.Reflection.Emit.AssemblyBuilder> to save an emitted assembly.
+In .NET Core versions and .NET 5-8, support for building an assembly and emitting reflection metadata for dynamically created types was limited to a runnable <xref:System.Reflection.Emit.AssemblyBuilder>. The lack of support for *saving* an assembly was often a blocker for customers migrating from .NET Framework to .NET. .NET 9 adds a new type, <xref:System.Reflection.Emit.PersistedAssemblyBuilder>, that you can use to save an emitted assembly.

-The new, persisted <xref:System.Reflection.Emit.AssemblyBuilder> implementation is runtime and platform independent. To create a persisted `AssemblyBuilder` instance, use the new `AssemblyBuilder.DefinePersistedAssembly` API. The existing `AssemblyBuilder.DefineDynamicAssembly` API accepts the assembly name and optional custom attributes. To use the new API, pass the core assembly, `System.Private.CoreLib`, which is used for referencing base runtime types. There's no option for <xref:System.Reflection.Emit.AssemblyBuilderAccess>. And for now, the persisted `AssemblyBuilder` implementation only supports saving, not running. After you create an instance of the persisted `AssemblyBuilder`, the subsequent steps for defining a module, type, method, or enum, writing IL, and all other usages remain unchanged. That means you can use existing code as-is for saving the assembly. The following code shows an example.
+To create a `PersistedAssemblyBuilder` instance, call its constructor and pass the assembly name; the core assembly, `System.Private.CoreLib`, which is used to reference base runtime types; and optional custom attributes. After you emit all members to the assembly, call the `PersistedAssemblyBuilder.Save(string assemblyFileName)` method to save the assembly with default settings. If you want to set the entry point or other options, you can call `PersistedAssemblyBuilder.GenerateMetadata(out BlobBuilder ilStream, out BlobBuilder mappedFieldData)` and use the metadata it returns to save the assembly. The following code shows an example of creating a persisted assembly and setting the entry point.

 :::code language="csharp" source="../snippets/dotnet-9/csharp/Reflection.cs" id="SaveAssembly":::
+
+## Tokenizers
+
+Tokenization is a fundamental component in the preprocessing of natural language text for AI models. Tokenizers are responsible for breaking down a string of text into smaller, more manageable parts, often referred to as *tokens*. When you use services like Azure OpenAI, tokenizers help you understand cost and manage context. When you work with self-hosted or local models, tokens are the inputs provided to those models.
+
+[Microsoft.ML.Tokenizers](https://devblogs.microsoft.com/dotnet/announcing-ml-net-2-0/#tokenizer-support) is an open-source, cross-platform tokenization library. When it was introduced, the library was scoped to the [Byte-Pair Encoding (BPE)](https://en.wikipedia.org/wiki/Byte_pair_encoding) tokenization strategy to satisfy the language scenarios in ML.NET. .NET 9 adds the following enhancements to the library:
+
+- Refines APIs and existing functionality.
+- Adds `Tiktoken` support.
+- Adds `LlamaTokenizer` support.
+- Adds support for scenarios covered by the `DeepDev` and `SharpToken` libraries. If you're using `DeepDev` or `SharpToken`, we recommend migrating to `Microsoft.ML.Tokenizers`. For more details, see the [migration guide](https://github.com/dotnet/machinelearning/blob/main/docs/code/microsoft-ml-tokenizers-migration-guide.md).
+
+The following examples show how to use the `Tiktoken` and `LlamaTokenizer` text tokenizers.
+
+Use the `Tiktoken` tokenizer:
+
+:::code language="csharp" source="../snippets/dotnet-9/csharp/Tiktoken.cs" id="Tiktoken":::
+
+Use the `LlamaTokenizer` tokenizer:
+
+:::code language="csharp" source="../snippets/dotnet-9/csharp/LlamaTokenizer.cs" id="Llama":::
+
+## New TimeSpan.From\* overloads
+
+The <xref:System.TimeSpan> class offers several `From*` methods that let you create a `TimeSpan` object using a `double`. However, since `double` is a binary-based floating-point format, [inherent imprecision can lead to errors](https://github.com/dotnet/runtime/issues/93890). For instance, `TimeSpan.FromSeconds(101.832)` might not precisely represent `101 seconds, 832 milliseconds`, but rather approximately `101 seconds, 831.9999999999936335370875895023345947265625 milliseconds`. This discrepancy has caused frequent confusion, and it's also not the most efficient way to represent such data. To address this, .NET 9 adds new overloads that let you create `TimeSpan` objects from integers. There are new overloads of `FromDays`, `FromHours`, `FromMinutes`, `FromSeconds`, `FromMilliseconds`, and `FromMicroseconds`.
+
+The following code shows an example of calling the `double` overload and one of the new integer overloads.
+ +:::code language="csharp" source="../snippets/dotnet-9/csharp/TimeSpan.cs" id="TimeSpan.From"::: + +## `ActivatorUtilities.CreateInstance` constructor + +The constructor resolution for `ActivatorUtilities.CreateInstance()` has changed in .NET 9. Previously, a constructor that was explicitly marked using the `[ActivatorUtilitiesConstructor]` attribute might not be called, depending on the ordering of constructors and the number of constructor parameters. The logic has changed in .NET 9 such that a constructor that has the attribute is always called. diff --git a/docs/core/whats-new/dotnet-9/overview.md b/docs/core/whats-new/dotnet-9/overview.md index 6482c7afc1ab2..41dae829a7a64 100644 --- a/docs/core/whats-new/dotnet-9/overview.md +++ b/docs/core/whats-new/dotnet-9/overview.md @@ -15,6 +15,50 @@ New for .NET 9, the engineering team posts .NET 9 preview updates on [GitHub Dis This article has been updated for .NET 9 Preview 3. +## .NET runtime + +The .NET 9 runtime includes performance improvements, including faster exception handling, inlining improvements for Native AOT, and loop and PGO improvements. For more information, see [What's new in the .NET 9 runtime](runtime.md). + +## .NET libraries + +The .NET 9 libraries include improvements to ... For more information, see [What's new in the .NET 9 libraries](libraries.md). + +## .NET SDK + +For information about what's new in the .NET SDK, see [What's new in the SDK and tooling for .NET 9](sdk.md). + +## .NET Aspire + +.NET Aspire is an opinionated, cloud-ready stack for building observable, production ready, distributed applications.​ .NET Aspire is delivered through a collection of NuGet packages that handle specific cloud-native concerns, and is available in preview for .NET 9. For more information, see [.NET Aspire (Preview)](/dotnet/aspire). + +## ASP.NET Core + +ASP.NET Core includes improvements to ... For more information, see [What's new in ASP.NET Core 9.0](/aspnet/core/release-notes/aspnetcore-9.0). + +## .NET MAUI + +.NET MAUI includes ... For more information, see [What's new in .NET MAUI for .NET 9](/dotnet/maui/whats-new/dotnet-9). + +## EF Core + +Entity Framework Core includes improvements to ... For more information, see [What's New in EF Core 9](/ef/core/what-is-new/ef-core-9.0/whatsnew). + + + ## See also - [Our vision for .NET 9](https://devblogs.microsoft.com/dotnet/our-vision-for-dotnet-9/) blog post diff --git a/docs/core/whats-new/dotnet-9/runtime.md b/docs/core/whats-new/dotnet-9/runtime.md index dffaca073c50d..6b732d3144aa8 100644 --- a/docs/core/whats-new/dotnet-9/runtime.md +++ b/docs/core/whats-new/dotnet-9/runtime.md @@ -89,7 +89,7 @@ Determining the type of an object requires a call into the runtime, which comes A new `EncodeToUtf8` implementation takes advantage of the 64-bit JIT compiler's ability to emit multi-register load/store instructions on Arm64. This behavior allows programs to process larger chunks of data with fewer instructions. .NET apps across various domains should see throughput improvements on Arm64 hardware that supports these features. Some [benchmarks](https://github.com/dotnet/perf-autofiling-issues/issues/27114) cut their execution time by more than half. -## Faster exceptions +### Faster exceptions The CoreCLR runtime has adopted a new exception handling approach that improves the performance of exception handling. The new implementation is based on the NativeAOT runtime's exception-handling model. 
The change removes support for Windows structured exception handling (SEH) and its emulation on Unix. The new approach is supported in all environment except for Windows x86 (32-bit). diff --git a/docs/core/whats-new/snippets/dotnet-9/csharp/LlamaTokenizer.cs b/docs/core/whats-new/snippets/dotnet-9/csharp/LlamaTokenizer.cs new file mode 100644 index 0000000000000..25fc9bfaab108 --- /dev/null +++ b/docs/core/whats-new/snippets/dotnet-9/csharp/LlamaTokenizer.cs @@ -0,0 +1,71 @@ +using System; +using System.Collections.Generic; +using System.IO; +using System.Net.Http; +using System.Threading.Tasks; +using Microsoft.ML.Tokenizers; + +internal class LlamaTokenizerExample +{ + public static async Task RunItAsync() + { + // Create the Tokenizer. + HttpClient httpClient = new HttpClient(); + string modelUrl = @"https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer.model"; + using Stream remoteStream = await httpClient.GetStreamAsync(modelUrl); + Tokenizer tokenizer = Tokenizer.CreateLlama(remoteStream); + + string text = "Hello, World!"; + + // Encode to IDs. + IReadOnlyList encodedIds = tokenizer.EncodeToIds(text); + Console.WriteLine($"encodedIds = {{{string.Join(", ", encodedIds)}}}"); + // encodedIds = {1, 15043, 29892, 2787, 29991} + + // Decode IDs to text. + string? decodedText = tokenizer.Decode(encodedIds); + Console.WriteLine($"decodedText = {decodedText}"); + // decodedText = Hello, World! + + // Get token count. + int idsCount = tokenizer.CountTokens(text); + Console.WriteLine($"idsCount = {idsCount}"); + // idsCount = 5 + + // Full encoding. + EncodingResult result = tokenizer.Encode(text); + Console.WriteLine($"result.Tokens = {{'{string.Join("', '", result.Tokens)}'}}"); + // result.Tokens = {'', '▁Hello', ',', '▁World', '!'} + Console.WriteLine($"result.Offsets = {{{string.Join(", ", result.Offsets)}}}"); + // result.Offsets = {(0, 0), (0, 6), (6, 1), (7, 6), (13, 1)} + Console.WriteLine($"result.Ids = {{{string.Join(", ", result.Ids)}}}"); + // result.Ids = {1, 15043, 29892, 2787, 29991} + + // Encode up to number of tokens limit. + int index1 = tokenizer.IndexOfTokenCount( + text, + maxTokenCount: 2, + out string processedText1, + out int tokenCount1 + );// Encode up to two tokens. + Console.WriteLine($"processedText1 = {processedText1}"); + // processedText1 = ▁Hello,▁World! + Console.WriteLine($"tokenCount1 = {tokenCount1}"); + // tokenCount1 = 2 + Console.WriteLine($"index1 = {index1}"); + // index1 = 6 + + int index2 = tokenizer.LastIndexOfTokenCount( + text, + maxTokenCount: 1, + out string processedText2, + out int tokenCount2 + ); // Encode from end up to one token. + Console.WriteLine($"processedText2 = {processedText2}"); + // processedText2 = ▁Hello,▁World! 
+ Console.WriteLine($"tokenCount2 = {tokenCount2}"); + // tokenCount2 = 1 + Console.WriteLine($"index2 = {index2}"); + // index2 = 13 + } +} diff --git a/docs/core/whats-new/snippets/dotnet-9/csharp/Program.cs b/docs/core/whats-new/snippets/dotnet-9/csharp/Program.cs index 37d6389b7842f..7b8a052e2b7c1 100644 --- a/docs/core/whats-new/snippets/dotnet-9/csharp/Program.cs +++ b/docs/core/whats-new/snippets/dotnet-9/csharp/Program.cs @@ -1,2 +1,3 @@ Linq.RunIt(); //Serialization.RunIt(); +//TimeSpan.RunIt(); diff --git a/docs/core/whats-new/snippets/dotnet-9/csharp/Project.csproj b/docs/core/whats-new/snippets/dotnet-9/csharp/Project.csproj index 413b632dec1c0..35ecdfd2d973e 100644 --- a/docs/core/whats-new/snippets/dotnet-9/csharp/Project.csproj +++ b/docs/core/whats-new/snippets/dotnet-9/csharp/Project.csproj @@ -3,8 +3,13 @@ Exe net9 + enable + + + + PreserveNewest diff --git a/docs/core/whats-new/snippets/dotnet-9/csharp/Reflection.cs b/docs/core/whats-new/snippets/dotnet-9/csharp/Reflection.cs index 9df9ed87a8fbc..b5124751ed74c 100644 --- a/docs/core/whats-new/snippets/dotnet-9/csharp/Reflection.cs +++ b/docs/core/whats-new/snippets/dotnet-9/csharp/Reflection.cs @@ -1,35 +1,56 @@ using System.Reflection.Emit; using System.Reflection; using System; +using System.Reflection.Metadata.Ecma335; +using System.Reflection.Metadata; +using System.Reflection.PortableExecutable; +using System.IO; internal class Reflection { // public void CreateAndSaveAssembly(string assemblyPath) { - AssemblyBuilder ab = AssemblyBuilder.DefinePersistedAssembly( + PersistedAssemblyBuilder ab = new PersistedAssemblyBuilder( new AssemblyName("MyAssembly"), typeof(object).Assembly ); TypeBuilder tb = ab.DefineDynamicModule("MyModule") .DefineType("MyType", TypeAttributes.Public | TypeAttributes.Class); - MethodBuilder mb = tb.DefineMethod( - "SumMethod", - MethodAttributes.Public | MethodAttributes.Static, - typeof(int), [typeof(int), typeof(int)] + MethodBuilder entryPoint = tb.DefineMethod( + "Main", + MethodAttributes.HideBySig | MethodAttributes.Public | MethodAttributes.Static ); - ILGenerator il = mb.GetILGenerator(); - il.Emit(OpCodes.Ldarg_0); - il.Emit(OpCodes.Ldarg_1); - il.Emit(OpCodes.Add); + ILGenerator il = entryPoint.GetILGenerator(); + // ... 
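+        // Emit the entry-point method body IL here (elided in this snippet);
+        // the final ret instruction follows.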
il.Emit(OpCodes.Ret); tb.CreateType(); - ab.Save(assemblyPath); // or could save to a Stream + + MetadataBuilder metadataBuilder = ab.GenerateMetadata( + out BlobBuilder ilStream, + out BlobBuilder fieldData + ); + PEHeaderBuilder peHeaderBuilder = new PEHeaderBuilder( + imageCharacteristics: Characteristics.ExecutableImage); + + ManagedPEBuilder peBuilder = new ManagedPEBuilder( + header: peHeaderBuilder, + metadataRootBuilder: new MetadataRootBuilder(metadataBuilder), + ilStream: ilStream, + mappedFieldData: fieldData, + entryPoint: MetadataTokens.MethodDefinitionHandle(entryPoint.MetadataToken) + ); + + BlobBuilder peBlob = new BlobBuilder(); + peBuilder.Serialize(peBlob); + + using var fileStream = new FileStream("MyAssembly.exe", FileMode.Create, FileAccess.Write); + peBlob.WriteContentTo(fileStream); } - public void UseAssembly(string assemblyPath) + public static void UseAssembly(string assemblyPath) { Assembly assembly = Assembly.LoadFrom(assemblyPath); Type type = assembly.GetType("MyType"); diff --git a/docs/core/whats-new/snippets/dotnet-9/csharp/Tiktoken.cs b/docs/core/whats-new/snippets/dotnet-9/csharp/Tiktoken.cs new file mode 100644 index 0000000000000..d9b876a5ec209 --- /dev/null +++ b/docs/core/whats-new/snippets/dotnet-9/csharp/Tiktoken.cs @@ -0,0 +1,65 @@ +using System; +using System.Collections.Generic; +using Microsoft.ML.Tokenizers; + +internal class TiktokenExample +{ + public static void RunIt() + { + // + Tokenizer tokenizer = Tokenizer.CreateTiktokenForModel("gpt-4"); + string text = "Hello, World!"; + + // Encode to IDs. + IReadOnlyList encodedIds = tokenizer.EncodeToIds(text); + Console.WriteLine($"encodedIds = {{{string.Join(", ", encodedIds)}}}"); + // encodedIds = {9906, 11, 4435, 0} + + // Decode IDs to text. + string decodedText = tokenizer.Decode(encodedIds); + Console.WriteLine($"decodedText = {decodedText}"); + // decodedText = Hello, World! + + // Get token count. + int idsCount = tokenizer.CountTokens(text); + Console.WriteLine($"idsCount = {idsCount}"); + // idsCount = 4 + + // Full encoding. + EncodingResult result = tokenizer.Encode(text); + Console.WriteLine($"result.Tokens = {{'{string.Join("', '", result.Tokens)}'}}"); + // result.Tokens = {'Hello', ',', ' World', '!'} + Console.WriteLine($"result.Offsets = {{{string.Join(", ", result.Offsets)}}}"); + // result.Offsets = {(0, 5), (5, 1), (6, 6), (12, 1)} + Console.WriteLine($"result.Ids = {{{string.Join(", ", result.Ids)}}}"); + // result.Ids = {9906, 11, 4435, 0} + + // Encode up to number of tokens limit. + int index1 = tokenizer.IndexOfTokenCount( + text, + maxTokenCount: 1, + out string processedText1, + out int tokenCount1 + ); // Encode up to one token. + Console.WriteLine($"processedText1 = {processedText1}"); + // processedText1 = Hello, World! + Console.WriteLine($"tokenCount1 = {tokenCount1}"); + // tokenCount1 = 1 + Console.WriteLine($"index1 = {index1}"); + // index1 = 5 + + int index2 = tokenizer.LastIndexOfTokenCount( + text, + maxTokenCount: 1, + out string processedText2, + out int tokenCount2 + ); // Encode from end up to one token. + Console.WriteLine($"processedText2 = {processedText2}"); + // processedText2 = Hello, World! 
+ Console.WriteLine($"tokenCount2 = {tokenCount2}"); + // tokenCount2 = 1 + Console.WriteLine($"index2 = {index2}"); + // index2 = 12 + // + } +} diff --git a/docs/core/whats-new/snippets/dotnet-9/csharp/TimeSpan.cs b/docs/core/whats-new/snippets/dotnet-9/csharp/TimeSpan.cs new file mode 100644 index 0000000000000..acb39fd6ef5a4 --- /dev/null +++ b/docs/core/whats-new/snippets/dotnet-9/csharp/TimeSpan.cs @@ -0,0 +1,17 @@ +using System; + +internal class TimeSpanExample +{ + public static void RunIt() + { + // + TimeSpan timeSpan1 = TimeSpan.FromSeconds(value: 101.832); + Console.WriteLine($"timeSpan1 = {timeSpan1}"); + // timeSpan1 = 00:01:41.8319999 + + TimeSpan timeSpan2 = TimeSpan.FromSeconds(seconds: 101, milliseconds: 832); + Console.WriteLine($"timeSpan2 = {timeSpan2}"); + // timeSpan2 = 00:01:41.8320000 + // + } +}
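
Editor's note: the `ActivatorUtilities.CreateInstance` change described in `docs/core/whats-new/dotnet-9/libraries.md` has no accompanying snippet in this patch. The following sketch, which is not part of the diff above, illustrates the documented behavior; the `Logger`, `Clock`, and `Worker` types and the service registrations are hypothetical and exist only for illustration.

```csharp
using System;
using Microsoft.Extensions.DependencyInjection;

public class Logger { }
public class Clock { }

public class Worker
{
    public string ChosenConstructor { get; }

    public Worker(Logger logger) => ChosenConstructor = "(Logger)";

    [ActivatorUtilitiesConstructor]
    public Worker(Logger logger, Clock clock) => ChosenConstructor = "(Logger, Clock)";
}

internal class ActivatorUtilitiesExample
{
    public static void RunIt()
    {
        using ServiceProvider services = new ServiceCollection()
            .AddSingleton<Logger>()
            .AddSingleton<Clock>()
            .BuildServiceProvider();

        // In .NET 9, the constructor marked with [ActivatorUtilitiesConstructor]
        // is always selected. In earlier versions, a different constructor might
        // have been selected, depending on ordering and parameter count.
        Worker worker = ActivatorUtilities.CreateInstance<Worker>(services);
        Console.WriteLine(worker.ChosenConstructor);
        // (Logger, Clock)
    }
}
```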