From 41bef6b5d9e7d887073bfa98935c732960487961 Mon Sep 17 00:00:00 2001
From: Genevieve Warren <24882762+gewarren@users.noreply.github.com>
Date: Mon, 15 Apr 2024 16:43:34 -0700
Subject: [PATCH] add ml.net overview and what's new
---
docs/machine-learning/index.yml | 19 +++---
docs/machine-learning/overview.md | 21 ++++++
docs/machine-learning/toc.yml | 35 +++++-----
docs/machine-learning/whats-new/overview.md | 53 +++++++++++++++
.../whats-new/snippets/csharp/Project.csproj | 13 ++++
.../whats-new/snippets/csharp/Tiktoken.cs | 65 +++++++++++++++++++
6 files changed, 178 insertions(+), 28 deletions(-)
create mode 100644 docs/machine-learning/overview.md
create mode 100644 docs/machine-learning/whats-new/overview.md
create mode 100644 docs/machine-learning/whats-new/snippets/csharp/Project.csproj
create mode 100644 docs/machine-learning/whats-new/snippets/csharp/Tiktoken.cs
diff --git a/docs/machine-learning/index.yml b/docs/machine-learning/index.yml
index b56448fb17dd9..70b475ca45e60 100644
--- a/docs/machine-learning/index.yml
+++ b/docs/machine-learning/index.yml
@@ -1,29 +1,27 @@
### YamlMime:Landing
-title: ML.NET Documentation
+title: ML.NET documentation
summary: Learn how to use open-source ML.NET to build custom machine learning models and integrate them into apps. Tutorials, code examples, and more show you how.
metadata:
- title: ML.NET Documentation - Tutorials, API Reference
+ title: ML.NET documentation - Tutorials, API reference
description: Learn how to use open-source ML.NET to build custom machine learning models and integrate them into apps. Tutorials, code examples, and more show you how.
ms.service: dotnet-ml
ms.topic: landing-page
ms.collection: collection
author: natke
ms.author: nakersha
- ms.date: 10/30/2019
-
+ ms.date: 04/15/2024
# linkListType: architecture | concept | deploy | download | get-started | how-to-guide | learn | overview | quickstart | reference | tutorial | video | whats-new
landingContent:
-
# Basics
- - title: ML.NET Basics
+ - title: ML.NET basics
linkLists:
- linkListType: overview
links:
- - text: What is ML.NET?
+ - text: What is the ML.NET API?
url: how-does-mldotnet-work.md
- text: What is Model Builder?
url: automate-training-with-model-builder.md
@@ -54,7 +52,7 @@ landingContent:
- text: Install the CLI on macOS, Windows, or Linux (low-code)
url: how-to-guides/install-ml-net-cli.md
- # Tutorials - scenarios
+ # Tutorials - scenarios
- title: Tutorials
linkLists:
- linkListType: learn
@@ -84,7 +82,6 @@ landingContent:
- text: Build a movie recommender (API)
url: tutorials/movie-recommendation.md
-
# API Card
- title: How-to guides
linkLists:
@@ -109,6 +106,8 @@ landingContent:
- linkListType: reference
links:
- text: ML.NET API reference
- url: ../../api/index.md?view=ml-dotnet
+ url: ../../api/index.md?view=ml-dotnet&preserve-view=true
+ - text: ML.NET CLI reference
+ url: reference/ml-net-cli-reference.md
- text: ML.NET samples
url: https://github.com/dotnet/machinelearning-samples
diff --git a/docs/machine-learning/overview.md b/docs/machine-learning/overview.md
new file mode 100644
index 0000000000000..ee0483c5654bc
--- /dev/null
+++ b/docs/machine-learning/overview.md
@@ -0,0 +1,21 @@
+---
+title: Overview of ML.NET
+description: Learn about ML.NET, an open-source, cross-platform machine learning framework for .NET, and its API packages, Model Builder extension, and command-line interface.
+ms.date: 04/15/2024
+---
+
+# Overview of ML.NET
+
+ML.NET is an open-source, cross-platform machine learning framework for .NET developers that enables integration of custom machine learning models into .NET applications. It encompasses three components: an [API](how-does-mldotnet-work.md) that consists of different NuGet packages; a Visual Studio extension called [Model Builder](automate-training-with-model-builder.md); and a [command-line interface](automate-training-with-cli.md) that's installed as a .NET tool.
+
+ML.NET packages:
+
+- [Microsoft.ML](https://www.nuget.org/packages/Microsoft.ML)
+- [Microsoft.ML.AutoML](https://www.nuget.org/packages/Microsoft.ML.AutoML)
+- [Microsoft.ML.Probabilistic](https://www.nuget.org/packages/Microsoft.ML.Probabilistic)
+- [Microsoft.ML.Tokenizers](https://www.nuget.org/packages/Microsoft.ML.Tokenizers)
+- [Many other packages](https://www.nuget.org/profiles/MLNET)
+
+Visual Studio extension:
+
+- [Model Builder extension for Visual Studio](https://marketplace.visualstudio.com/items?itemName=MLNET.ModelBuilder2022)
diff --git a/docs/machine-learning/toc.yml b/docs/machine-learning/toc.yml
index 5a39075836060..dc2807636777d 100644
--- a/docs/machine-learning/toc.yml
+++ b/docs/machine-learning/toc.yml
@@ -2,21 +2,24 @@ items:
- name: ML.NET
href: index.yml
- name: Overview
+ href: overview.md
+- name: Model Builder & CLI
items:
- - name: Model Builder & CLI
- items:
- - name: The ML.NET Model Builder tool
- href: automate-training-with-model-builder.md
- - name: The ML.NET command-line interface
- href: automate-training-with-cli.md
- displayName: cli
- - name: API
- items:
- - name: The ML.NET API
- href: how-does-mldotnet-work.md
- - name: What is Automated Machine Learning (AutoML)?
- href: automated-machine-learning-mlnet.md
- expanded: true
+ - name: The ML.NET Model Builder tool
+ href: automate-training-with-model-builder.md
+ - name: The ML.NET command-line interface
+ href: automate-training-with-cli.md
+ displayName: cli
+ expanded: true
+- name: API
+ items:
+ - name: The ML.NET API
+ href: how-does-mldotnet-work.md
+ - name: What is Automated Machine Learning (AutoML)?
+ href: automated-machine-learning-mlnet.md
+ expanded: true
+- name: What's new
+ href: whats-new/overview.md
- name: Tutorials
items:
- name: Model Builder & CLI
@@ -136,10 +139,6 @@ items:
href: /azure/machine-learning/how-to-use-automl-onnx-model-dotnet?toc=/dotnet/machine-learning/how-to-guides/toc.json&bc=/dotnet/machine-learning/how-to-guides/toc.json
- name: Reference
items:
- - name: ML.NET API reference
- href: ../../api/index.md?view=ml-dotnet&preserve-view=true
- - name: ML.NET Preview API reference
- href: ../../api/index.md?view=ml-dotnet-preview&preserve-view=true
- name: CLI reference
href: reference/ml-net-cli-reference.md
- name: Resources
diff --git a/docs/machine-learning/whats-new/overview.md b/docs/machine-learning/whats-new/overview.md
new file mode 100644
index 0000000000000..26ca97880e577
--- /dev/null
+++ b/docs/machine-learning/whats-new/overview.md
@@ -0,0 +1,53 @@
+---
+title: What's new in ML.NET
+titleSuffix: ""
+description: Discover what's new in ML.NET.
+ms.date: 04/15/2024
+ms.topic: whats-new
+
+#Customer intent: As a developer, I want to know what the new features are in ML.NET.
+
+---
+
+# What's new in ML.NET
+
+> [!NOTE]
+> This article is a work in progress.
+
+You can find all of the release notes for the ML.NET API in the [dotnet/machinelearning repo](https://github.com/dotnet/machinelearning/tree/main/docs/release-notes).
+
+## New deep-learning tasks
+
+ML.NET 3.0 added support for the following deep-learning tasks:
+
+- Object detection (backed by TorchSharp)
+- Named entity recognition (NER)
+- Question answering (QA)
+
+These trainers are included in the [Microsoft.ML.TorchSharp](https://www.nuget.org/packages/Microsoft.ML.TorchSharp) package. For more information, see [Announcing ML.NET 3.0](https://devblogs.microsoft.com/dotnet/announcing-ml-net-3-0/).
+
+## AutoML
+
+In ML.NET 3.0, the AutoML sweeper was updated to support the sentence similarity, question answering, and object detection tasks. For more information about AutoML, see [How to use the ML.NET Automated Machine Learning (AutoML) API](../how-to-guides/how-to-use-the-automl-api.md).
+
+## Additional tokenizer support
+
+[Microsoft.ML.Tokenizers](https://devblogs.microsoft.com/dotnet/announcing-ml-net-2-0/#tokenizer-support) is an open-source, cross-platform tokenization library. When it was introduced, the library was scoped to the [Byte-Pair Encoding (BPE)](https://en.wikipedia.org/wiki/Byte_pair_encoding) tokenization strategy to satisfy the language set of scenarios in ML.NET. Version 4.0 Preview 1 added support for the `Tiktoken` tokenizer.
+
+The following examples show how to use the `Tiktoken` text tokenizer.
+
+:::code language="csharp" source="./snippets/csharp/Tiktoken.cs" id="Tiktoken":::
+
+### About tokenization
+
+Tokenization is a fundamental component in the preprocessing of natural language text for AI models. Tokenizers are responsible for breaking down a string of text into smaller, more manageable parts, often referred to as *tokens*. When using services like Azure OpenAI, you can use tokenizers to get a better understanding of cost and manage context. When working with self-hosted or local models, tokens are the inputs provided to those models.
+
+## Model Builder (Visual Studio extension)
+
+Model Builder has been updated to consume the ML.NET 3.0 release. Model Builder version 17.18.0 added question answering (QA) and named entity recognition (NER) scenarios.
+
+You can find all of the Model Builder release notes in the [dotnet/machinelearning-modelbuilder repo](https://github.com/dotnet/machinelearning-modelbuilder/tree/main/docs/release-notes).
+
+## See also
+
+- [Blog post: Announcing ML.NET 3.0](https://devblogs.microsoft.com/dotnet/announcing-ml-net-3-0/)
diff --git a/docs/machine-learning/whats-new/snippets/csharp/Project.csproj b/docs/machine-learning/whats-new/snippets/csharp/Project.csproj
new file mode 100644
index 0000000000000..05d3e73559e06
--- /dev/null
+++ b/docs/machine-learning/whats-new/snippets/csharp/Project.csproj
@@ -0,0 +1,13 @@
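+<Project Sdk="Microsoft.NET.Sdk">
+
+  <PropertyGroup>
+    <OutputType>Exe</OutputType>
+    <TargetFramework>net9</TargetFramework>
+    <Nullable>enable</Nullable>
+  </PropertyGroup>
+
+  <ItemGroup>
+    <PackageReference Include="Microsoft.ML.Tokenizers" Version="0.22.0-preview.24179.1" />
+  </ItemGroup>
+
+</Project>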
diff --git a/docs/machine-learning/whats-new/snippets/csharp/Tiktoken.cs b/docs/machine-learning/whats-new/snippets/csharp/Tiktoken.cs
new file mode 100644
index 0000000000000..68b55b02ecb11
--- /dev/null
+++ b/docs/machine-learning/whats-new/snippets/csharp/Tiktoken.cs
@@ -0,0 +1,65 @@
+using System;
+using System.Collections.Generic;
+using Microsoft.ML.Tokenizers;
+
+/// <summary>Sample that encodes, decodes, and counts tokens for a short string using the Tiktoken tokenizer.</summary>
+internal class TiktokenExample
+{
+    /// <summary>Runs the sample and writes each result to the console; the Tiktoken snippet region is embedded in the What's new article.</summary>
+    public static void RunIt()
+    {
+        // <Tiktoken>
+        Tokenizer tokenizer = Tokenizer.CreateTiktokenForModel("gpt-4");
+        string text = "Hello, World!";
+
+        // Encode to IDs.
+        IReadOnlyList<int> encodedIds = tokenizer.EncodeToIds(text);
+        Console.WriteLine($"encodedIds = {{{string.Join(", ", encodedIds)}}}");
+        // encodedIds = {9906, 11, 4435, 0}
+
+        // Decode IDs to text.
+        string decodedText = tokenizer.Decode(encodedIds);
+        Console.WriteLine($"decodedText = {decodedText}");
+        // decodedText = Hello, World!
+
+        // Get token count.
+        int idsCount = tokenizer.CountTokens(text);
+        Console.WriteLine($"idsCount = {idsCount}");
+        // idsCount = 4
+
+        // Full encoding.
+        EncodingResult result = tokenizer.Encode(text);
+        Console.WriteLine($"result.Tokens = {{'{string.Join("', '", result.Tokens)}'}}");
+        // result.Tokens = {'Hello', ',', ' World', '!'}
+        Console.WriteLine($"result.Offsets = {{{string.Join(", ", result.Offsets)}}}");
+        // result.Offsets = {(0, 5), (5, 1), (6, 6), (12, 1)}
+        Console.WriteLine($"result.Ids = {{{string.Join(", ", result.Ids)}}}");
+        // result.Ids = {9906, 11, 4435, 0}
+
+        // Encode up to number of tokens limit.
+        int index1 = tokenizer.IndexOfTokenCount(
+            text,
+            maxTokenCount: 1,
+            out string processedText1,
+            out int tokenCount1); // Encode up to one token.
+        Console.WriteLine($"processedText1 = {processedText1}");
+        // processedText1 = Hello, World!
+        Console.WriteLine($"tokenCount1 = {tokenCount1}");
+        // tokenCount1 = 1
+        Console.WriteLine($"index1 = {index1}");
+        // index1 = 5
+
+        int index2 = tokenizer.LastIndexOfTokenCount(
+            text,
+            maxTokenCount: 1,
+            out string processedText2,
+            out int tokenCount2); // Encode from end up to one token.
+        Console.WriteLine($"processedText2 = {processedText2}");
+        // processedText2 = Hello, World!
+        Console.WriteLine($"tokenCount2 = {tokenCount2}");
+        // tokenCount2 = 1
+        Console.WriteLine($"index2 = {index2}");
+        // index2 = 12
+        // </Tiktoken>
+    }
+}