From 41bef6b5d9e7d887073bfa98935c732960487961 Mon Sep 17 00:00:00 2001 From: Genevieve Warren <24882762+gewarren@users.noreply.github.com> Date: Mon, 15 Apr 2024 16:43:34 -0700 Subject: [PATCH] add ml.net overview and what's new --- docs/machine-learning/index.yml | 19 +++--- docs/machine-learning/overview.md | 21 ++++++ docs/machine-learning/toc.yml | 35 +++++----- docs/machine-learning/whats-new/overview.md | 53 +++++++++++++++ .../whats-new/snippets/csharp/Project.csproj | 13 ++++ .../whats-new/snippets/csharp/Tiktoken.cs | 65 +++++++++++++++++++ 6 files changed, 178 insertions(+), 28 deletions(-) create mode 100644 docs/machine-learning/overview.md create mode 100644 docs/machine-learning/whats-new/overview.md create mode 100644 docs/machine-learning/whats-new/snippets/csharp/Project.csproj create mode 100644 docs/machine-learning/whats-new/snippets/csharp/Tiktoken.cs diff --git a/docs/machine-learning/index.yml b/docs/machine-learning/index.yml index b56448fb17dd9..70b475ca45e60 100644 --- a/docs/machine-learning/index.yml +++ b/docs/machine-learning/index.yml @@ -1,29 +1,27 @@ ### YamlMime:Landing -title: ML.NET Documentation +title: ML.NET documentation summary: Learn how to use open-source ML.NET to build custom machine learning models and integrate them into apps. Tutorials, code examples, and more show you how. metadata: - title: ML.NET Documentation - Tutorials, API Reference + title: ML.NET documentation - Tutorials, API reference description: Learn how to use open-source ML.NET to build custom machine learning models and integrate them into apps. Tutorials, code examples, and more show you how. 
ms.service: dotnet-ml ms.topic: landing-page ms.collection: collection author: natke ms.author: nakersha - ms.date: 10/30/2019 - + ms.date: 04/15/2024 # linkListType: architecture | concept | deploy | download | get-started | how-to-guide | learn | overview | quickstart | reference | tutorial | video | whats-new landingContent: - # Basics - - title: ML.NET Basics + - title: ML.NET basics linkLists: - linkListType: overview links: - - text: What is ML.NET? + - text: What is the ML.NET API? url: how-does-mldotnet-work.md - text: What is Model Builder? url: automate-training-with-model-builder.md @@ -54,7 +52,7 @@ landingContent: - text: Install the CLI on macOS, Windows, or Linux (low-code) url: how-to-guides/install-ml-net-cli.md - # Tutorials - scenarios + # Tutorials - scenarios - title: Tutorials linkLists: - linkListType: learn @@ -84,7 +82,6 @@ landingContent: - text: Build a movie recommender (API) url: tutorials/movie-recommendation.md - # API Card - title: How-to guides linkLists: @@ -109,6 +106,8 @@ landingContent: - linkListType: reference links: - text: ML.NET API reference - url: ../../api/index.md?view=ml-dotnet + url: ../../api/index.md?view=ml-dotnet&preserve-view=true + - text: ML.NET CLI reference + url: reference/ml-net-cli-reference.md - text: ML.NET samples url: https://github.com/dotnet/machinelearning-samples diff --git a/docs/machine-learning/overview.md b/docs/machine-learning/overview.md new file mode 100644 index 0000000000000..ee0483c5654bc --- /dev/null +++ b/docs/machine-learning/overview.md @@ -0,0 +1,21 @@ +--- +title: Overview of ML.NET +description: Learn about ML.NET, an open-source, cross-platform machine learning framework for .NET, and the API, Model Builder extension, and command-line interface it includes. +ms.date: 04/15/2024 +--- + +# Overview of ML.NET + +ML.NET is an open-source, cross-platform machine learning framework for .NET developers that enables integration of custom machine learning models into .NET applications. 
It encompasses three components: an [API](how-does-mldotnet-work.md), which consists of different NuGet packages; a Visual Studio extension called [Model Builder](automate-training-with-model-builder.md); and a [command-line interface](automate-training-with-cli.md) that's installed as a .NET tool. + +ML.NET packages: + +- [Microsoft.ML](https://www.nuget.org/packages/Microsoft.ML) +- [Microsoft.ML.AutoML](https://www.nuget.org/packages/Microsoft.ML.AutoML) +- [Microsoft.ML.Probabilistic](https://www.nuget.org/packages/Microsoft.ML.Probabilistic) +- [Microsoft.ML.Tokenizers](https://www.nuget.org/packages/Microsoft.ML.Tokenizers) +- [Many other packages](https://www.nuget.org/profiles/MLNET) + +Visual Studio extension: + +- [Model Builder extension for Visual Studio](https://marketplace.visualstudio.com/items?itemName=MLNET.ModelBuilder2022) diff --git a/docs/machine-learning/toc.yml b/docs/machine-learning/toc.yml index 5a39075836060..dc2807636777d 100644 --- a/docs/machine-learning/toc.yml +++ b/docs/machine-learning/toc.yml @@ -2,21 +2,24 @@ items: - name: ML.NET href: index.yml - name: Overview + href: overview.md +- name: Model Builder & CLI items: - - name: Model Builder & CLI - items: - - name: The ML.NET Model Builder tool - href: automate-training-with-model-builder.md - - name: The ML.NET command-line interface - href: automate-training-with-cli.md - displayName: cli - - name: API - items: - - name: The ML.NET API - href: how-does-mldotnet-work.md - - name: What is Automated Machine Learning (AutoML)? - href: automated-machine-learning-mlnet.md - expanded: true + - name: The ML.NET Model Builder tool + href: automate-training-with-model-builder.md + - name: The ML.NET command-line interface + href: automate-training-with-cli.md + displayName: cli + expanded: true +- name: API + items: + - name: The ML.NET API + href: how-does-mldotnet-work.md + - name: What is Automated Machine Learning (AutoML)? 
+ href: automated-machine-learning-mlnet.md + expanded: true +- name: What's new + href: whats-new/overview.md - name: Tutorials items: - name: Model Builder & CLI @@ -136,10 +139,6 @@ items: href: /azure/machine-learning/how-to-use-automl-onnx-model-dotnet?toc=/dotnet/machine-learning/how-to-guides/toc.json&bc=/dotnet/machine-learning/how-to-guides/toc.json - name: Reference items: - - name: ML.NET API reference - href: ../../api/index.md?view=ml-dotnet&preserve-view=true - - name: ML.NET Preview API reference - href: ../../api/index.md?view=ml-dotnet-preview&preserve-view=true - name: CLI reference href: reference/ml-net-cli-reference.md - name: Resources diff --git a/docs/machine-learning/whats-new/overview.md b/docs/machine-learning/whats-new/overview.md new file mode 100644 index 0000000000000..26ca97880e577 --- /dev/null +++ b/docs/machine-learning/whats-new/overview.md @@ -0,0 +1,53 @@ +--- +title: What's new in ML.NET +titleSuffix: "" +description: Discover what's new in ML.NET. +ms.date: 04/15/2024 +ms.topic: whats-new + +#Customer intent: As a developer, I want to know what the new features are in ML.NET. + +--- + +# What's new in ML.NET + +> [!NOTE] +> This article is a work in progress. + +You can find all of the release notes for the ML.NET API in the [dotnet/machinelearning repo](https://github.com/dotnet/machinelearning/tree/main/docs/release-notes). + +## New deep-learning tasks + +ML.NET 3.0 added support for the following deep-learning tasks: + +- Object detection (backed by TorchSharp) +- Named entity recognition (NER) +- Question answering (QA) + +These trainers are included in the [Microsoft.ML.TorchSharp](https://www.nuget.org/packages/Microsoft.ML.TorchSharp) package. For more information, see [Announcing ML.NET 3.0](https://devblogs.microsoft.com/dotnet/announcing-ml-net-3-0/). + +## AutoML + +In ML.NET 3.0, the AutoML sweeper was updated to support the sentence similarity, question answering, and object detection tasks. 
For more information about AutoML, see [How to use the ML.NET Automated Machine Learning (AutoML) API](../how-to-guides/how-to-use-the-automl-api.md). + +## Additional tokenizer support + +[Microsoft.ML.Tokenizers](https://devblogs.microsoft.com/dotnet/announcing-ml-net-2-0/#tokenizer-support) is an open-source, cross-platform tokenization library. When it was introduced, the library was scoped to the [Byte-Pair Encoding (BPE)](https://en.wikipedia.org/wiki/Byte_pair_encoding) tokenization strategy to satisfy the language set of scenarios in ML.NET. Version 4.0 Preview 1 added support for the `Tiktoken` tokenizer. + +The following examples show how to use the `Tiktoken` text tokenizer. + +:::code language="csharp" source="./snippets/csharp/Tiktoken.cs" id="Tiktoken"::: + +### About tokenization + +Tokenization is a fundamental component in the preprocessing of natural language text for AI models. Tokenizers are responsible for breaking down a string of text into smaller, more manageable parts, often referred to as *tokens*. When using services like Azure OpenAI, you can use tokenizers to get a better understanding of cost and manage context. When working with self-hosted or local models, tokens are the inputs provided to those models. + +## Model Builder (Visual Studio extension) + +Model Builder has been updated to consume the ML.NET 3.0 release. Model Builder version 17.18.0 added question answering (QA) and named entity recognition (NER) scenarios. + +You can find all of the Model Builder release notes in the [dotnet/machinelearning-modelbuilder repo](https://github.com/dotnet/machinelearning-modelbuilder/tree/main/docs/release-notes). 
+ +## See also + +- [Blog post: Announcing ML.NET 3.0](https://devblogs.microsoft.com/dotnet/announcing-ml-net-3-0/) diff --git a/docs/machine-learning/whats-new/snippets/csharp/Project.csproj b/docs/machine-learning/whats-new/snippets/csharp/Project.csproj new file mode 100644 index 0000000000000..05d3e73559e06 --- /dev/null +++ b/docs/machine-learning/whats-new/snippets/csharp/Project.csproj @@ -0,0 +1,13 @@ + + + + Exe + net9 + enable + + + + + + + diff --git a/docs/machine-learning/whats-new/snippets/csharp/Tiktoken.cs b/docs/machine-learning/whats-new/snippets/csharp/Tiktoken.cs new file mode 100644 index 0000000000000..68b55b02ecb11 --- /dev/null +++ b/docs/machine-learning/whats-new/snippets/csharp/Tiktoken.cs @@ -0,0 +1,65 @@ +using System; +using System.Collections.Generic; +using Microsoft.ML.Tokenizers; + +internal class TiktokenExample +{ + public static void RunIt() + { + // <Tiktoken> + Tokenizer tokenizer = Tokenizer.CreateTiktokenForModel("gpt-4"); + string text = "Hello, World!"; + + // Encode to IDs. + IReadOnlyList<int> encodedIds = tokenizer.EncodeToIds(text); + Console.WriteLine($"encodedIds = {{{string.Join(", ", encodedIds)}}}"); + // encodedIds = {9906, 11, 4435, 0} + + // Decode IDs to text. + string decodedText = tokenizer.Decode(encodedIds); + Console.WriteLine($"decodedText = {decodedText}"); + // decodedText = Hello, World! + + // Get token count. + int idsCount = tokenizer.CountTokens(text); + Console.WriteLine($"idsCount = {idsCount}"); + // idsCount = 4 + + // Full encoding. + EncodingResult result = tokenizer.Encode(text); + Console.WriteLine($"result.Tokens = {{'{string.Join("', '", result.Tokens)}'}}"); + // result.Tokens = {'Hello', ',', ' World', '!'} + Console.WriteLine($"result.Offsets = {{{string.Join(", ", result.Offsets)}}}"); + // result.Offsets = {(0, 5), (5, 1), (6, 6), (12, 1)} + Console.WriteLine($"result.Ids = {{{string.Join(", ", result.Ids)}}}"); + // result.Ids = {9906, 11, 4435, 0} + + // Encode up to number of tokens limit. 
+ int index1 = tokenizer.IndexOfTokenCount( + text, + maxTokenCount: 1, + out string processedText1, + out int tokenCount1 + ); // Encode up to one token. + Console.WriteLine($"processedText1 = {processedText1}"); + // processedText1 = Hello, World! + Console.WriteLine($"tokenCount1 = {tokenCount1}"); + // tokenCount1 = 1 + Console.WriteLine($"index1 = {index1}"); + // index1 = 5 + + int index2 = tokenizer.LastIndexOfTokenCount( + text, + maxTokenCount: 1, + out string processedText2, + out int tokenCount2 + ); // Encode from end up to one token. + Console.WriteLine($"processedText2 = {processedText2}"); + // processedText2 = Hello, World! + Console.WriteLine($"tokenCount2 = {tokenCount2}"); + // tokenCount2 = 1 + Console.WriteLine($"index2 = {index2}"); + // index2 = 12 + // </Tiktoken> + } +}