I have the following custom WebApiSkill:
@app.route(route="CustomSplitSkill", auth_level=func.AuthLevel.FUNCTION)
def CustomSplit&PageSkill(req: func.HttpRequest) -> func.HttpResponse:
logging.info('Python HTTP trigger function processed a request.')
try:
req_body = req.get_json()
except ValueError:
return func.HttpResponse("Invalid input", status_code=400)
try:
# 'values' expected top-level key in the request body
response_body = {"values": []}
for value in req_body.get('values', []):
recordId = value.get('recordId')
text = value.get('data', {}).get('text', '')
# Remove sequences of dots, numbers following them, and
# any additional punctuation or newline characters, replacing them with a single space
cleaned_text = re.sub(r"[',.\n]+|\d+", ' ', text)
# Replace multiple spaces with a single space and trim leading/trailing spaces
cleaned_text = re.sub(r'\s{2,}', ' ', cleaned_text).strip()
# Pattern to match sequences of ". " occurring more than twice
cleaned_text = re.sub(r"(\. ){3,}", "", cleaned_text)
chunks, page_numbers = split_text_into_chunks_with_overlap(cleaned_text, chunk_size=256, overlap_size=20)
# response object for specific pdf
response_record = {
"recordId": recordId,
"data": {
"textItems": chunks, # chunks is a str list
"numberItems": page_numbers # page_numbers is an int list
}
}
response_body['values'].append(response_record)
return func.HttpResponse(json.dumps(response_body), mimetype="application/json")
except ValueError:
return func.HttpResponse("Function app crashed", status_code=400)
The inputs and outputs of this skill in the skillset are defined like this:
inputs=[
    InputFieldMappingEntry(name="text", source="/document/content")
],
outputs=[
    OutputFieldMappingEntry(name="textItems", target_name="pages"),
    OutputFieldMappingEntry(name="numberItems", target_name="numbers")
],
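For reference, these mappings sit inside the WebApiSkill definition roughly as follows (the skill name, uri, timeout and batch_size below are illustrative placeholders, not my actual values):

from azure.search.documents.indexes.models import (
    InputFieldMappingEntry,
    OutputFieldMappingEntry,
    WebApiSkill,
)

split_skill = WebApiSkill(
    name="custom-split-skill",  # placeholder
    description="Chunks the document text and returns a page number per chunk",
    uri="https://<function-app>.azurewebsites.net/api/CustomSplitSkill?code=<function-key>",  # placeholder
    http_method="POST",
    timeout="PT230S",
    batch_size=1,
    context="/document",
    inputs=[
        InputFieldMappingEntry(name="text", source="/document/content")
    ],
    outputs=[
        OutputFieldMappingEntry(name="textItems", target_name="pages"),
        OutputFieldMappingEntry(name="numberItems", target_name="numbers")
    ],
)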
And the SearchIndexerIndexProjectionSelector is configured in the following way:
index_projections = SearchIndexerIndexProjections(
    selectors=[
        SearchIndexerIndexProjectionSelector(
            target_index_name=index_name,
            parent_key_field_name="parent_id",
            source_context="/document/pages/*",
            mappings=[
                InputFieldMappingEntry(name="chunk", source="/document/pages/*"),
                InputFieldMappingEntry(name="vector", source="/document/pages/*/vector"),
                InputFieldMappingEntry(name="title", source="/document/metadata_storage_name"),
                InputFieldMappingEntry(name="page_number", source="/document/numbers/*"),
            ],
        ),
    ],
    parameters=SearchIndexerIndexProjectionsParameters(
        projection_mode=IndexProjectionMode.SKIP_INDEXING_PARENT_DOCUMENTS
    ),
)
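The skillset ties the skill and these projections together roughly like this (the skillset name is a placeholder; this assumes azure-search-documents 11.4.x, where the keyword is index_projections):

from azure.search.documents.indexes.models import SearchIndexerSkillset

skillset = SearchIndexerSkillset(
    name="my-chunking-skillset",   # placeholder
    description="Chunks documents and projects each chunk as its own index document",
    skills=[split_skill],          # the custom WebApiSkill (plus any embedding skill)
    index_projections=index_projections,
)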
My search fields look like this:
fields = [
    SearchField(
        name="parent_id",
        type=SearchFieldDataType.String,
        sortable=True,
        filterable=True,
        facetable=True
    ),
    SearchField(
        name="title",
        type=SearchFieldDataType.String
    ),
    SearchField(
        name="chunk_id",
        type=SearchFieldDataType.String,
        key=True,
        sortable=True,
        filterable=True,
        facetable=True,
        analyzer_name="keyword"
    ),
    SearchField(
        name="chunk",
        type=SearchFieldDataType.String,
        sortable=False,
        filterable=False,
        facetable=False
    ),
    SearchField(
        name="vector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        vector_search_dimensions=1536,
        vector_search_profile_name="myHnswProfile"
    ),
    SearchField(
        name="page_number",
        type=SearchFieldDataType.Int32,
        sortable=True,
        filterable=True,
        facetable=True
    ),
]
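The index itself is then created from these fields along these lines (endpoint and key are placeholders; vector_search holds the "myHnswProfile" configuration and is defined elsewhere):

from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import SearchIndex

index_client = SearchIndexClient(
    endpoint="https://<search-service>.search.windows.net",  # placeholder
    credential=AzureKeyCredential("<admin-key>"),            # placeholder
)
index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search)
index_client.create_or_update_index(index)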
I get the following error:
The data field 'page_number' in the document with key 'xyz' has an invalid value of type 'Edm.String' ('String maps to Edm.String'). The expected type was 'Edm.Int32'.
When I change the field type to Edm.String, the index creation passes, but each chunk document then ends up with the whole list serialized as a string under page_number:
"page_number": "[1,2,3,4,5,6,7,...]"
But I want a single page number value under each chunk document.
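For example, the chunk that came from page 3 should be projected with just:

"page_number": 3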