Create the on-demand job
Replace
EXTRACTION_PROMPT with your extraction prompt, and INPUT_DIR with the path to your local directory of files to process. The response includes the job ID.

Each on-demand job is limited to 10 files, and each file is limited to 50 MB in size.

If you need to launch a series of on-demand jobs in rapid succession, you must wait at least one second between launch requests. Otherwise, you will receive a rate limit error.

A maximum of 5 on-demand jobs can be running in your Unstructured account at the same time. If you launch a new on-demand job but 5 existing on-demand jobs are still running, the new on-demand job will remain in a scheduled state until one of the 5 existing on-demand jobs is done running.
- curl
- Python
Save and run this script. This script requires jq to parse the JSON response.
#!/usr/bin/env bash
# Create an on-demand job that partitions every regular file in INPUT_DIR and
# runs a structured-data extractor over it, then print the new job's ID.
# Requires curl, jq, and file, plus the UNSTRUCTURED_API_URL and
# UNSTRUCTURED_API_KEY environment variables.

EXTRACTION_PROMPT="Represent dates such as May-12-24 as 2024-05-12 and June-12-25 as 2025-06-12."
INPUT_DIR="/full/path/to/your/directory"

# Build one multipart --form argument per regular file in INPUT_DIR.
form_args=()
for filepath in "$INPUT_DIR"/*; do
    # Skip subdirectories; this also skips the literal pattern when nothing matches.
    [ -f "$filepath" ] || continue
    filename=$(basename "$filepath")
    mimetype=$(file --mime-type -b "$filepath")
    form_args+=(--form "input_files=@${filepath};filename=${filename};type=${mimetype}")
done

# Fail early rather than submitting a job with no input files.
if [ "${#form_args[@]}" -eq 0 ]; then
    echo "No files found in: $INPUT_DIR" >&2
    exit 1
fi

json_schema='{"type":"object","properties":{"invoice_number":{"type":"number"},"invoice_date":{"type":"string"},"payment_due":{"type":"string"},"bill_to":{"type":"string"}},"additionalProperties":false,"required":["invoice_number","invoice_date","payment_due","bill_to"]}'

# NOTE(review): --arg passes the schema as a JSON-encoded string, whereas the
# Python example sends it as an object. Confirm which form the API expects;
# use --argjson instead of --arg to send an object.
request_data=$(jq -n --arg prompt "$EXTRACTION_PROMPT" --arg schema "$json_schema" '{
    "job_nodes": [
        {"name":"Partitioner","type":"partition","subtype":"vlm","settings":{"is_dynamic":true,"allow_fast":true}},
        {"name":"Extractor","type":"structured_data_extractor","subtype":"llm","settings":{"schema_to_extract":{"json_schema":$schema,"extraction_guidance":$prompt},"provider":"openai","model":"gpt-5-mini","output_mode":"extracted_data_only"}}
    ]
}')

response=$(curl --request POST --location \
    "$UNSTRUCTURED_API_URL/jobs/" \
    --header "accept: application/json" \
    --header "unstructured-api-key: $UNSTRUCTURED_API_KEY" \
    --form "request_data=$request_data" \
    "${form_args[@]}")

# An error response has no .id; report it instead of printing "Job ID: null".
JOB_ID=$(echo "$response" | jq -r '.id // empty')
if [ -z "$JOB_ID" ]; then
    echo "Job creation failed. Response: $response" >&2
    exit 1
fi
echo "Job ID: $JOB_ID"
import mimetypes
import os
import json
from unstructured_client import UnstructuredClient
from unstructured_client.models.operations import CreateJobRequest
from unstructured_client.models.shared import BodyCreateJob, InputFiles
# Create an on-demand job that partitions every regular file in INPUT_DIR and
# runs a structured-data extractor over it, then print the new job's ID.
EXTRACTION_PROMPT = "<your-extraction-prompt>"
INPUT_DIR = "/full/path/to/your/directory"

client = UnstructuredClient(
    api_key_auth=os.getenv("UNSTRUCTURED_API_KEY"),
    server_url=os.getenv("UNSTRUCTURED_API_URL")
)

# Workflow definition: a VLM partitioner followed by an LLM extractor that
# returns only the fields described by the JSON schema.
request_data = json.dumps({
    "job_nodes": [
        {
            "name": "Partitioner",
            "type": "partition",
            "subtype": "vlm",
            "settings": {
                "is_dynamic": True,
                "allow_fast": True
            }
        },
        {
            "name": "Extractor",
            "type": "structured_data_extractor",
            "subtype": "llm",
            "settings": {
                "schema_to_extract": {
                    "json_schema": {
                        "type": "object",
                        "properties": {
                            "invoice_number": {"type": "number"},
                            "invoice_date": {"type": "string"},
                            "payment_due": {"type": "string"},
                            "bill_to": {"type": "string"}
                        },
                        "additionalProperties": False,
                        "required": [
                            "invoice_number",
                            "invoice_date",
                            "payment_due",
                            "bill_to"
                        ]
                    },
                    "extraction_guidance": EXTRACTION_PROMPT
                },
                "provider": "openai",
                "model": "gpt-5-mini",
                "output_mode": "extracted_data_only"
            }
        }
    ]
})

input_files = []
open_handles = []  # Tracked so every file is closed once the upload finishes.
try:
    for filename in os.listdir(INPUT_DIR):
        full_path = os.path.join(INPUT_DIR, filename)
        if not os.path.isfile(full_path):
            continue  # Skip subdirectories and other non-regular entries.
        content_type, _ = mimetypes.guess_type(full_path)
        handle = open(full_path, "rb")
        open_handles.append(handle)
        input_files.append(
            InputFiles(
                content=handle,
                file_name=filename,
                # Fall back to a generic type when the extension is unknown.
                content_type=content_type or "application/octet-stream"
            )
        )

    response = client.jobs.create_job(
        request=CreateJobRequest(
            body_create_job=BodyCreateJob(
                request_data=request_data,
                input_files=input_files
            )
        )
    )
finally:
    # The handles must stay open during the request, so close them here
    # whether or not the request succeeded.
    for handle in open_handles:
        handle.close()

job_info = response.job_information
print(f"Job ID: {job_info.id}")
Poll for job status
Replace
JOB_ID with the job ID from the previous step. This script polls every 10 seconds and stops when the job completes, fails, or is stopped.
- curl
- Python
Save and run this script. This script requires jq to parse the JSON response.
#!/usr/bin/env bash
# Poll a job every 10 seconds until it completes, fails, or is stopped.
# On completion, print the IDs of the job's output files.
# Requires curl and jq, plus the UNSTRUCTURED_API_URL and
# UNSTRUCTURED_API_KEY environment variables.

JOB_ID="<job-id>"

while true; do
    job=$(curl --request GET --silent --location \
        "$UNSTRUCTURED_API_URL/jobs/$JOB_ID" \
        --header "accept: application/json" \
        --header "unstructured-api-key: $UNSTRUCTURED_API_KEY")

    # A response without a status (bad job ID, auth error, network failure)
    # would otherwise keep this loop running forever.
    status=$(echo "$job" | jq -r '.status // empty')
    if [ -z "$status" ]; then
        echo "Could not read job status. Response: $job" >&2
        exit 1
    fi
    echo "Job status: $status"

    if [ "$status" = "COMPLETED" ]; then
        echo "Job completed."
        # "// []" tolerates a missing or null output_node_files field.
        echo "Output node file IDs: $(echo "$job" | jq -c '[(.output_node_files // [])[].file_id]')"
        break
    elif [ "$status" = "FAILED" ] || [ "$status" = "STOPPED" ]; then
        echo "Job did not complete successfully: $status"
        exit 1
    fi

    sleep 10
done
import os
import time
from unstructured_client import UnstructuredClient
# Poll a job every 10 seconds until it completes, fails, or is stopped.
# On completion, print the IDs of the job's output files.
JOB_ID = "<job-id>"

client = UnstructuredClient(
    api_key_auth=os.getenv("UNSTRUCTURED_API_KEY"),
    server_url=os.getenv("UNSTRUCTURED_API_URL")
)

while True:
    response = client.jobs.get_job(request={"job_id": JOB_ID})
    job_info = response.job_information
    status = job_info.status
    print(f"Job status: {status.value}")

    if status == "COMPLETED":
        print("Job completed.")
        # output_node_files may be None, so fall back to an empty list.
        print(f"Output node file IDs: {[f.file_id for f in (job_info.output_node_files or [])]}")
        break
    elif status in ("FAILED", "STOPPED"):
        # Report the plain status value, matching the progress line above.
        raise RuntimeError(f"Job did not complete successfully: {status.value}")

    time.sleep(10)
Download the job output
Replace
JOB_ID, OUTPUT_FILE_IDS, and OUTPUT_DIR with your values from the previous steps.
- curl
- Python
Save and run this script:
#!/usr/bin/env bash
# Download each output file of a completed job into OUTPUT_DIR as
# <file-id>.json. Requires curl, plus the UNSTRUCTURED_API_URL and
# UNSTRUCTURED_API_KEY environment variables.

JOB_ID="<job-id>"
OUTPUT_FILE_IDS=("<output-file-id>" "<output-file-id>") # From the polling step's output
OUTPUT_DIR="/full/path/to/your/output/directory"

mkdir -p "$OUTPUT_DIR"

failed=0
for file_id in "${OUTPUT_FILE_IDS[@]}"; do
    # --fail makes curl exit non-zero on an HTTP error, so an error response
    # is not written to disk and reported as a saved result.
    if curl --request GET --silent --show-error --fail --location \
        "$UNSTRUCTURED_API_URL/jobs/$JOB_ID/download?file_id=$file_id" \
        --header "unstructured-api-key: $UNSTRUCTURED_API_KEY" \
        --output "$OUTPUT_DIR/$file_id.json"; then
        echo "Saved: $OUTPUT_DIR/$file_id.json"
    else
        echo "Failed to download output file: $file_id" >&2
        failed=1
    fi
done

# Exit non-zero if any download failed, after attempting all of them.
exit "$failed"
import json
import os
from unstructured_client import UnstructuredClient
from unstructured_client.models.operations import DownloadJobOutputRequest
# Download each output file of a completed job into OUTPUT_DIR as
# <file-id>.json.
JOB_ID = "<job-id>"
OUTPUT_FILE_IDS = ["<output-file-id>", "<output-file-id>"]  # From the polling step's output
OUTPUT_DIR = "/full/path/to/your/output/directory"

client = UnstructuredClient(
    api_key_auth=os.getenv("UNSTRUCTURED_API_KEY"),
    server_url=os.getenv("UNSTRUCTURED_API_URL")
)

os.makedirs(OUTPUT_DIR, exist_ok=True)

for file_id in OUTPUT_FILE_IDS:
    response = client.jobs.download_job_output(
        request=DownloadJobOutputRequest(job_id=JOB_ID, file_id=file_id)
    )
    # Build "<OUTPUT_DIR>/<file_id>.json". Passing ".json" as a separate
    # path component would point at "<file_id>/.json" inside a directory
    # that does not exist.
    output_path = os.path.join(OUTPUT_DIR, f"{file_id}.json")
    with open(output_path, "w") as f:
        json.dump(response.any, f, indent=4)
    print(f"Saved: {output_path}")
Complete end-to-end script
Replace EXTRACTION_PROMPT, INPUT_DIR, and OUTPUT_DIR with your values, then save and run this script.
- curl
- Python
This script requires jq to parse JSON responses.
#!/usr/bin/env bash
# End-to-end on-demand job: upload every regular file in INPUT_DIR, poll the
# job until it finishes, then download each output file into OUTPUT_DIR.
# Requires curl, jq, and file, plus the UNSTRUCTURED_API_URL and
# UNSTRUCTURED_API_KEY environment variables.

EXTRACTION_PROMPT="Represent dates such as May-12-24 as 2024-05-12 and June-12-25 as 2025-06-12."
INPUT_DIR="/full/path/to/your/input/directory"
OUTPUT_DIR="/full/path/to/your/output/directory"

# Step 1: Create the on-demand job.
form_args=()
for filepath in "$INPUT_DIR"/*; do
    # Skip subdirectories; this also skips the literal pattern when nothing matches.
    [ -f "$filepath" ] || continue
    filename=$(basename "$filepath")
    mimetype=$(file --mime-type -b "$filepath")
    form_args+=(--form "input_files=@${filepath};filename=${filename};type=${mimetype}")
done

# Fail early rather than submitting a job with no input files.
if [ "${#form_args[@]}" -eq 0 ]; then
    echo "No files found in: $INPUT_DIR" >&2
    exit 1
fi

json_schema='{"type":"object","properties":{"invoice_number":{"type":"number"},"invoice_date":{"type":"string"},"payment_due":{"type":"string"},"bill_to":{"type":"string"}},"additionalProperties":false,"required":["invoice_number","invoice_date","payment_due","bill_to"]}'

# NOTE(review): --arg passes the schema as a JSON-encoded string, whereas the
# Python example sends it as an object. Confirm which form the API expects;
# use --argjson instead of --arg to send an object.
request_data=$(jq -n --arg prompt "$EXTRACTION_PROMPT" --arg schema "$json_schema" '{
    "job_nodes": [
        {"name":"Partitioner","type":"partition","subtype":"vlm","settings":{"is_dynamic":true,"allow_fast":true}},
        {"name":"Extractor","type":"structured_data_extractor","subtype":"llm","settings":{"schema_to_extract":{"json_schema":$schema,"extraction_guidance":$prompt},"provider":"openai","model":"gpt-5-mini","output_mode":"extracted_data_only"}}
    ]
}')

response=$(curl --request POST --location \
    "$UNSTRUCTURED_API_URL/jobs/" \
    --header "accept: application/json" \
    --header "unstructured-api-key: $UNSTRUCTURED_API_KEY" \
    --form "request_data=$request_data" \
    "${form_args[@]}")

# An error response has no .id. Stop here, because polling a job named
# "null" would never reach a terminal status.
JOB_ID=$(echo "$response" | jq -r '.id // empty')
if [ -z "$JOB_ID" ]; then
    echo "Job creation failed. Response: $response" >&2
    exit 1
fi
echo "Job ID: $JOB_ID"

# Step 2: Poll until the job completes.
output_file_ids=()
while true; do
    job=$(curl --request GET --silent --location \
        "$UNSTRUCTURED_API_URL/jobs/$JOB_ID" \
        --header "accept: application/json" \
        --header "unstructured-api-key: $UNSTRUCTURED_API_KEY")

    # A response without a status would otherwise keep this loop running forever.
    status=$(echo "$job" | jq -r '.status // empty')
    if [ -z "$status" ]; then
        echo "Could not read job status. Response: $job" >&2
        exit 1
    fi
    echo "Job status: $status"

    if [ "$status" = "COMPLETED" ]; then
        echo "Job completed."
        # "// []" tolerates a missing or null output_node_files field.
        while IFS= read -r id; do
            output_file_ids+=("$id")
        done < <(echo "$job" | jq -r '(.output_node_files // [])[].file_id')
        break
    elif [ "$status" = "FAILED" ] || [ "$status" = "STOPPED" ]; then
        echo "Job did not complete successfully: $status"
        exit 1
    fi

    sleep 10
done

# Step 3: Download the job output.
mkdir -p "$OUTPUT_DIR"

failed=0
for file_id in "${output_file_ids[@]}"; do
    # --fail makes curl exit non-zero on an HTTP error, so an error response
    # is not written to disk and reported as a saved result.
    if curl --request GET --silent --show-error --fail --location \
        "$UNSTRUCTURED_API_URL/jobs/$JOB_ID/download?file_id=$file_id" \
        --header "unstructured-api-key: $UNSTRUCTURED_API_KEY" \
        --output "$OUTPUT_DIR/$file_id.json"; then
        echo "Saved: $OUTPUT_DIR/$file_id.json"
    else
        echo "Failed to download output file: $file_id" >&2
        failed=1
    fi
done

# Exit non-zero if any download failed, after attempting all of them.
exit "$failed"
import json
import mimetypes
import os
import time
from unstructured_client import UnstructuredClient
from unstructured_client.models.operations import CreateJobRequest, DownloadJobOutputRequest
from unstructured_client.models.shared import BodyCreateJob, InputFiles
# End-to-end on-demand job: upload every regular file in INPUT_DIR, poll the
# job until it finishes, then download each output file into OUTPUT_DIR.
EXTRACTION_PROMPT = "<your-extraction-prompt>"
INPUT_DIR = "/full/path/to/your/input/directory"
OUTPUT_DIR = "/full/path/to/your/output/directory"

client = UnstructuredClient(
    api_key_auth=os.getenv("UNSTRUCTURED_API_KEY"),
    server_url=os.getenv("UNSTRUCTURED_API_URL")
)

# Step 1: Create the on-demand job.
# Workflow definition: a VLM partitioner followed by an LLM extractor that
# returns only the fields described by the JSON schema.
request_data = json.dumps({
    "job_nodes": [
        {
            "name": "Partitioner",
            "type": "partition",
            "subtype": "vlm",
            "settings": {
                "is_dynamic": True,
                "allow_fast": True
            }
        },
        {
            "name": "Extractor",
            "type": "structured_data_extractor",
            "subtype": "llm",
            "settings": {
                "schema_to_extract": {
                    "json_schema": {
                        "type": "object",
                        "properties": {
                            "invoice_number": {"type": "number"},
                            "invoice_date": {"type": "string"},
                            "payment_due": {"type": "string"},
                            "bill_to": {"type": "string"}
                        },
                        "additionalProperties": False,
                        "required": [
                            "invoice_number",
                            "invoice_date",
                            "payment_due",
                            "bill_to"
                        ]
                    },
                    "extraction_guidance": EXTRACTION_PROMPT
                },
                "provider": "openai",
                "model": "gpt-5-mini",
                "output_mode": "extracted_data_only"
            }
        }
    ]
})

input_files = []
open_handles = []  # Tracked so every file is closed once the upload finishes.
try:
    for filename in os.listdir(INPUT_DIR):
        full_path = os.path.join(INPUT_DIR, filename)
        if not os.path.isfile(full_path):
            continue  # Skip subdirectories and other non-regular entries.
        content_type, _ = mimetypes.guess_type(full_path)
        handle = open(full_path, "rb")
        open_handles.append(handle)
        input_files.append(
            InputFiles(
                content=handle,
                file_name=filename,
                # Fall back to a generic type when the extension is unknown.
                content_type=content_type or "application/octet-stream"
            )
        )

    response = client.jobs.create_job(
        request=CreateJobRequest(
            body_create_job=BodyCreateJob(
                request_data=request_data,
                input_files=input_files
            )
        )
    )
finally:
    # The handles must stay open during the request, so close them here
    # whether or not the request succeeded.
    for handle in open_handles:
        handle.close()

job_id = response.job_information.id
print(f"Job ID: {job_id}")

# Step 2: Poll until the job completes.
while True:
    response = client.jobs.get_job(request={"job_id": job_id})
    job_info = response.job_information
    status = job_info.status
    print(f"Job status: {status.value}")

    if status == "COMPLETED":
        print("Job completed.")
        break
    elif status in ("FAILED", "STOPPED"):
        # Report the plain status value, matching the progress line above.
        raise RuntimeError(f"Job did not complete successfully: {status.value}")

    time.sleep(10)

# output_node_files may be None, so fall back to an empty list.
output_node_file_ids = [f.file_id for f in (job_info.output_node_files or [])]

# Step 3: Download the job output.
os.makedirs(OUTPUT_DIR, exist_ok=True)

for file_id in output_node_file_ids:
    response = client.jobs.download_job_output(
        request=DownloadJobOutputRequest(job_id=job_id, file_id=file_id)
    )
    output_path = os.path.join(OUTPUT_DIR, f"{file_id}.json")
    with open(output_path, "w") as f:
        json.dump(response.any, f, indent=4)
    print(f"Saved: {output_path}")
What’s next?
- Create on-demand jobs that only partition, add enrichments, or add embeddings.
- Learn about the Unstructured API’s other workflow operations.

