DocumentExtractionSkill issue in Azure AI Search

Karim Alameh 20 Reputation points
2024-07-02T12:38:32.1433333+00:00

I have a blob storage with PDF files, on which I want to implement a knowledge mining solution. I created the data source, the index, the skillset, and the indexer. However, when I run everything I get the warning "Could not execute skill because one or more skill input was invalid." Am I not using the skill properly? Is there an input I'm missing?

skillset code:


# Imports (from the azure-search-documents package)
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexerClient
from azure.search.documents.indexes.models import (
    DocumentExtractionSkill,
    InputFieldMappingEntry,
    OutputFieldMappingEntry,
    SearchIndexerSkillset,
)

# Function to create a skillset for document extraction
def create_skillset(search_service_endpoint, search_service_api_key, skillset_name):
    credential = AzureKeyCredential(search_service_api_key)
    indexer_client = SearchIndexerClient(endpoint=search_service_endpoint, credential=credential)

    # Define skills
    doc_extraction_skill = DocumentExtractionSkill(
        name="documentExtractionSkill",
        description="Extract text from documents",
        context="/document",
        configuration={"imageAction": "generateNormalizedImagePerPage"},
        inputs=[InputFieldMappingEntry(name="file_data", source="/document/file_data")],
        outputs=[OutputFieldMappingEntry(name="content", target_name="/documents/content")]
    )

    # Create skillset
    skillset = SearchIndexerSkillset(
        name=skillset_name,
        skills=[doc_extraction_skill]
    )

    # Create skillset in Azure Cognitive Search
    indexer_client.create_skillset(skillset)
    print(f"Skillset '{skillset_name}' created successfully.")

indexer:


# Additional imports (from azure.search.documents.indexes.models)
from azure.search.documents.indexes.models import (
    FieldMapping,
    IndexingParameters,
    IndexingSchedule,
    SearchIndexer,
)

def create_indexer(search_service_endpoint, search_service_api_key, indexer_name, data_source_name, index_name, skillset_name):
    credential = AzureKeyCredential(search_service_api_key)
    indexer_client = SearchIndexerClient(endpoint=search_service_endpoint, credential=credential)
    field_mappings = [
        FieldMapping(source_field_name="metadata_storage_path", target_field_name="metadata_storage_path"),
        FieldMapping(source_field_name="metadata_storage_name", target_field_name="metadata_storage_name"),
        FieldMapping(source_field_name="metadata_storage_last_modified", target_field_name="metadata_storage_last_modified"),
        FieldMapping(source_field_name="metadata_content_type", target_field_name="metadata_content_type"),
    ]

    output_field_mappings = [
        FieldMapping(source_field_name="/document/content", target_field_name= "content"),
    ]
    # Define indexing parameters
    indexing_parameters = IndexingParameters(
        configuration={
            "indexStorageMetadataOnlyForOversizedDocuments": True,
            "failOnUnsupportedContentType": False,
            "indexedFileNameExtensions": ".pdf,.docx,.txt,.json",
            "parseJson": True,
            "parsingMode": "default",
            "allowSkillsetToReadFileData": False
        }
    )

    indexer = SearchIndexer(
        name=indexer_name,
        data_source_name=data_source_name,
        target_index_name=index_name,
        skillset_name=skillset_name,
        field_mappings=field_mappings,
        output_field_mappings=output_field_mappings,
        schedule=IndexingSchedule(interval="PT15M"),
        parameters=indexing_parameters
    )

    indexer_client.create_indexer(indexer)
    print(f"Indexer '{indexer_name}' created.")

Accepted answer
  brtrach-MSFT 15,786 Reputation points · Microsoft Employee
    2024-07-02T19:30:57.5+00:00

    @Karim Alameh The error message typically indicates that the input provided to the skill is either missing, has the wrong type, or is otherwise invalid.

    Here are a few things to check and consider:

    1. Input Field Mapping: Ensure that the input mapping for file_data is correctly specified. The source should be /document/file_data; verify that this path actually exists in the enriched document for your PDFs.
    2. Skill Configuration: Verify the configuration parameters of the DocumentExtractionSkill. The imageAction parameter must be a valid value, such as generateNormalizedImagePerPage if you want a normalized image generated for each page.
    3. Indexing Parameters: Your indexer sets allowSkillsetToReadFileData to False, so /document/file_data is never populated, which is exactly the kind of missing input this warning describes. Set this parameter to True so the skillset can read the file data.
    4. Skill Inputs and Outputs: Ensure that the inputs and outputs of the skill are defined correctly. Your skill writes its output to /documents/content (note the plural), while the indexer's output field mapping reads from /document/content; those paths must match.

    Here’s a revised version of your skillset creation code with these considerations:

    def create_skillset(search_service_endpoint, search_service_api_key, skillset_name):
        credential = AzureKeyCredential(search_service_api_key)
        indexer_client = SearchIndexerClient(endpoint=search_service_endpoint, credential=credential)
        # Define skills
        doc_extraction_skill = DocumentExtractionSkill(
            name="documentExtractionSkill",
            description="Extract text from documents",
            context="/document",
            configuration={"imageAction": "generateNormalizedImagePerPage"},
            inputs=[InputFieldMappingEntry(name="file_data", source="/document/file_data")],
            outputs=[OutputFieldMappingEntry(name="content", target_name="/document/content")]  # singular /document, matching the indexer's output field mapping
        )
        # Create skillset
        skillset = SearchIndexerSkillset(
            name=skillset_name,
            skills=[doc_extraction_skill]
        )
        # Create skillset in Azure Cognitive Search
        indexer_client.create_skillset(skillset)
        print(f"Skillset '{skillset_name}' created successfully.")
    

    And for the indexer:

    def create_indexer(search_service_endpoint, search_service_api_key, indexer_name, data_source_name, index_name, skillset_name):
        credential = AzureKeyCredential(search_service_api_key)
        indexer_client = SearchIndexerClient(endpoint=search_service_endpoint, credential=credential)
        field_mappings = [
            FieldMapping(source_field_name="metadata_storage_path", target_field_name="metadata_storage_path"),
            FieldMapping(source_field_name="metadata_storage_name", target_field_name="metadata_storage_name"),
            FieldMapping(source_field_name="metadata_storage_last_modified", target_field_name="metadata_storage_last_modified"),
            FieldMapping(source_field_name="metadata_content_type", target_field_name="metadata_content_type"),
        ]
        output_field_mappings = [
            FieldMapping(source_field_name="/document/content", target_field_name="content"),
        ]
        # Define indexing parameters
        indexing_parameters = IndexingParameters(
            configuration={
                "indexStorageMetadataOnlyForOversizedDocuments": True,
                "failOnUnsupportedContentType": False,
                "indexedFileNameExtensions": ".pdf,.docx,.txt,.json",
                "parseJson": True,
                "parsingMode": "default",
                "allowSkillsetToReadFileData": True  # Set this to True
            }
        )
        indexer = SearchIndexer(
            name=indexer_name,
            data_source_name=data_source_name,
            target_index_name=index_name,
            skillset_name=skillset_name,
            field_mappings=field_mappings,
            output_field_mappings=output_field_mappings,
            schedule=IndexingSchedule(interval="PT15M"),
            parameters=indexing_parameters
        )
        indexer_client.create_indexer(indexer)
        print(f"Indexer '{indexer_name}' created.")
    

    Try these adjustments and see if they resolve the issue. If the problem persists, you might want to check the detailed error message in the Azure portal for more specific information.
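    If you prefer to inspect the warning details programmatically rather than in the portal, you can query the indexer's last execution with get_indexer_status. A minimal sketch, assuming the azure-search-documents package (summarize_execution and check_indexer are helper names of my own, not SDK APIs):

    ```python
    def summarize_execution(result):
        """Flatten errors/warnings from an indexer execution result into readable lines."""
        lines = []
        for err in getattr(result, "errors", None) or []:
            lines.append(f"ERROR: {err.error_message}")
        for warn in getattr(result, "warnings", None) or []:
            lines.append(f"WARNING: {warn.message}")
        return lines

    def check_indexer(search_service_endpoint, search_service_api_key, indexer_name):
        # Imports kept local so summarize_execution stays testable without the SDK installed.
        from azure.core.credentials import AzureKeyCredential
        from azure.search.documents.indexes import SearchIndexerClient

        client = SearchIndexerClient(
            endpoint=search_service_endpoint,
            credential=AzureKeyCredential(search_service_api_key),
        )
        status = client.get_indexer_status(indexer_name)
        print(f"Overall status: {status.status}")
        if status.last_result:
            for line in summarize_execution(status.last_result):
                print(line)
    ```

    Running check_indexer after each indexer run surfaces the same "Could not execute skill" warnings you see in the portal, with their document keys and messages.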
