Multilingual Tensor Search
Introduction
In this guide, we'll walk through setting up a multilingual search index using Marqo.
Getting Started
Before diving into the code, ensure you have the following prerequisites completed:
- Clone the Repository
  Get the example files by cloning the repository:
  git clone --branch 2.0.0 https://github.com/marqo-ai/marqo.git
  cd marqo/examples/MultiLingual
- Run Marqo
  Start the Marqo service using Docker with the following commands (a quick connectivity check follows this list):
  docker rm -f marqo
  docker pull marqoai/marqo:2.0.0
  docker run --name marqo -it -p 8882:8882 --add-host host.docker.internal:host-gateway marqoai/marqo:2.0.0
- Refer to the Original Code
  The full example code is available here, and is also reproduced in the Full Code section at the end of this guide.
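Before moving on, it can help to confirm that the container is reachable. The snippet below is a minimal sanity check, assuming Marqo is listening on the default http://localhost:8882 endpoint; it simply lists the indexes the instance currently knows about.
# Quick connectivity check: list the indexes on the local Marqo instance.
from marqo import Client

mq = Client("http://localhost:8882")
print(mq.get_indexes())  # a fresh instance should report no user-created indexes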
Building the Index
To create a multilingual index, we'll perform the following steps:
1. Setup and Imports
Start by setting up your environment and importing necessary libraries:
from marqo import Client
from datasets import load_dataset
import datetime
import json
import pprint
import logging
# Define the index name
INDEX_NAME = "my-multilingual-index"
# Initialize logging for HTTP request information
logging.basicConfig(level=logging.DEBUG)
# Initialize the Marqo client
mq = Client("http://localhost:8882")
2. Loading the Data
Load the MultiEURLEX dataset, focusing on English and German validation splits:
dataset_en = load_dataset("multi_eurlex", "en", split="validation")
dataset_de = load_dataset("multi_eurlex", "de", split="validation")
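If you'd like to sanity-check the data before indexing, you can inspect one record. This is purely optional; the indexing code below assumes each record has at least celex_id, text, and labels fields.
# Optional: peek at one English record to see the fields we'll be indexing.
sample = dataset_en[0]
print(list(sample.keys()))
print(sample["celex_id"], len(sample["text"]))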
3. Creating the Index
Create the index using the chosen multilingual model:
mq.create_index(index_name=INDEX_NAME, model="stsb-xlm-r-multilingual")
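Note that re-running this step against an index that already exists will raise an error. The full code at the end of this guide handles that by deleting any old index first; a minimal version of that guard, run before create_index, looks like this:
# Remove any previous version of the index so create_index doesn't fail on re-runs.
try:
    mq.index(INDEX_NAME).delete()
except Exception:
    pass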
4. Preparing the Documents and Indexing them
Split large documents into smaller parts to make them easier to search, then index each part by posting it to the Marqo index:
# record the start time:
t0 = datetime.datetime.now()
MAX_TEXT_LENGTH = 100000
for ds, lang in [(dataset_en, "en"), (dataset_de, "de")]:
    num_docs_in_dataset = len(ds)
    for ii, doc in enumerate(ds):
        dumped = json.dumps(doc)
        # we'll set the doc ID to be the document's hash
        doc_id = str(hash(dumped))
        text_length = len(doc["text"])
        split_size = MAX_TEXT_LENGTH // 2
        # break up the text of large documents:
        if text_length > MAX_TEXT_LENGTH:
            text_splits = [
                doc["text"][i : i + split_size]
                for i in range(0, text_length, split_size)
            ]
        else:
            text_splits = [doc["text"]]
        for i, sub_doc in enumerate(text_splits):
            # if a document is broken up, add the text's index to the end of the document:
            qualified_id = f"{doc_id}.{i}" if len(text_splits) > 1 else doc_id
            # create a dict to be posted
            to_post = dict(
                [
                    (k, v) if k != "labels" else (k, str(v))
                    for k, v in doc.items()
                    if k != "text"
                ]
                + [("_id", qualified_id), ("language", lang), ("text", sub_doc)]
            )
            print(
                f"doc number {ii} out of {num_docs_in_dataset} docs in dataset {lang}. "
                f"_id: {qualified_id}, celex_id: {doc['celex_id']}, "
                f"json to send size: {len(json.dumps(to_post))}"
            )
            # Post the document to the Marqo index:
            mq.index(index_name=INDEX_NAME).add_documents(
                documents=[to_post],
                tensor_fields=["text", "language"],
                client_batch_size=64,
            )
t1 = datetime.datetime.now()
print(f"finished indexing. Started at {t0}. Finished at {t1}. Took {t1 - t0}")
Searching the Index
Once indexing is complete, you can perform searches using the following function:
1. Define the Search Function
Create a function that runs a query against the index and prints the highlight from each hit (the multilingual model handles both languages, so no language needs to be specified):
def search(q):
    result = mq.index(INDEX_NAME).search(q=q)
    for res in result["hits"]:
        pprint.pprint(res["_highlights"])
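If you also want to see which document each highlight came from, a slightly expanded version could print the hit's ID, language, and score as well. This is just a sketch; search_verbose is an illustrative name, and it assumes these hit fields are returned by your Marqo version.
def search_verbose(q):
    result = mq.index(INDEX_NAME).search(q=q)
    for res in result["hits"]:
        # stored document fields come back with each hit alongside Marqo's metadata
        print(res["_id"], res.get("language"), res.get("_score"))
        pprint.pprint(res["_highlights"])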
2. Execute a Search
Test the search with a query of your choice:
# Replace 'my_search_query' with your search text
my_search_query = "Laws about the fishing industry"
search(my_search_query)
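Because the index uses a multilingual model, the same function handles German queries without any extra configuration. For example (the German phrasing below is just one possible translation of the English query):
# Query the same index in German; no language parameter is needed.
search("Gesetze über die Fischereiindustrie")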
Full Code Example
"""
This example uses the MultiEURLEX dataset.
Log from running:
Took 45 minutes on ml.g4dn.2xlarge
"""
# change this to 'cpu' if the machine you are running Marqo on doesn't have an
# Nvidia GPU
DEVICE = "cuda"
# import marqo:
from marqo import Client
# import the huggingface datasets package:
from datasets import load_dataset
# import other python packages
import datetime
import json
import pprint
import logging
# this will be the name of the index:
INDEX_NAME = "my-multilingual-index"
# this helps us see information about the HTTP requests
logging.basicConfig(level=logging.DEBUG)
# Create a new Marqo client:
mq = Client("http://localhost:8882")
def build_index():
    # Load the datasets. For this example we're just using the English and
    # German validation splits:
    dataset_en = load_dataset('multi_eurlex', 'en', split="validation")
    dataset_de = load_dataset('multi_eurlex', 'de', split="validation")
    # record the start time:
    t0 = datetime.datetime.now()
    try:
        mq.index(INDEX_NAME).delete()
    except Exception:
        # the index may not exist yet, which is fine
        pass
    # Create the index. The model we're using is multilingual:
    mq.create_index(index_name=INDEX_NAME, model='stsb-xlm-r-multilingual')
    # Let's break up large documents to make it easier to search:
    MAX_TEXT_LENGTH = 100000
    for ds, lang in [(dataset_en, "en"), (dataset_de, "de")]:
        num_docs_in_dataset = len(ds)
        for ii, doc in enumerate(ds):
            dumped = json.dumps(doc)
            # we'll set the doc ID to be the document's hash
            doc_id = str(hash(dumped))
            text_length = len(doc['text'])
            split_size = MAX_TEXT_LENGTH//2
            # break up the text of large documents:
            if text_length > MAX_TEXT_LENGTH:
                text_splits = [doc['text'][i: i + split_size] for i in range(0, text_length, split_size)]
            else:
                text_splits = [doc['text']]
            for i, sub_doc in enumerate(text_splits):
                # if a document is broken up, add the text's index to the end of the document:
                qualified_id = f"{doc_id}.{i}" if len(text_splits) > 1 else doc_id
                # create a dict to be posted
                to_post = dict(
                    [(k, v) if k != "labels" else (k, str(v)) for k, v in doc.items() if k != 'text']
                    + [("_id", qualified_id), ("language", lang), ('text', sub_doc)]
                )
                print(f"doc number {ii} out of {num_docs_in_dataset} docs in dataset {lang}. "
                      f"_id: {qualified_id}, celex_id: {doc['celex_id']}, "
                      f"json to send size: {len(json.dumps(to_post))}")
                # Index the document. The device is set to 'cuda' to take
                # advantage of the machine's GPU. If you don't have a GPU,
                # change this argument to 'cpu'.
                mq.index(index_name=INDEX_NAME).add_documents(
                    documents=[to_post], device=DEVICE,
                    tensor_fields=["language", "text", "labels"]
                )
    t1 = datetime.datetime.now()
    print(f"finished indexing. Started at {t0}. Finished at {t1}. Took {t1 - t0}")
def search(q):
    result = mq.index(INDEX_NAME).search(q=q)
    # Just print out the highlights, which makes the output easier to read
    for res in result["hits"]:
        pprint.pprint(res["_highlights"])
# After you finish indexing, comment out the following line to avoid going through
# the whole indexing process again.
build_index()
# Replace 'my_search_query' with whatever text you want to search. In English or German!
my_search_query = "Laws about the fishing industry"
search(my_search_query)