# sehatech-demo / test_download_data.sh
# larawehbe's picture
# Upload folder using huggingface_hub
# 965ac15 verified
#!/bin/bash
#
# Download a small sample of open-access article PDFs listed by the PMC OA
# web service and append their metadata to metadata.txt.
#
# Usage: test_download_data.sh [limit]
#   limit  maximum number of records to process (default: 10)
#
# PDFs are saved in the current directory under their remote file names.
# The exit status is non-zero when the record list cannot be fetched or when
# any download fails.

# -e is deliberately not set: every critical command is checked explicitly so
# that one failed download does not abort the remaining ones.
set -uo pipefail

# Specify the number of articles to download
limit=10

readonly OA_URL='https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi'
readonly METADATA_FILE='metadata.txt'

#######################################
# Print the value of one attribute taken from the text of an XML tag.
# Arguments: $1 - tag text, e.g. '<record id="PMC1" citation="..."'
#            $2 - attribute name
# Outputs:   the attribute value on stdout; nothing if the attribute is absent
# Returns:   0 if the attribute was found, 1 otherwise
#######################################
xml_attr() {
  local tag=$1 name=$2
  # Require whitespace before the name so "id" cannot match inside "pmcid";
  # [^"]* stops at the closing quote instead of running into later attributes.
  local re="[[:space:]]${name}=\"([^\"]*)\""
  [[ " $tag" =~ $re ]] || return 1
  printf '%s\n' "${BASH_REMATCH[1]}"
}

#######################################
# Print the href of the first <link ... format="pdf" ...> tag of a record.
# Arguments: $1 - text of one record element
# Outputs:   the link target on stdout
# Returns:   0 if a PDF link was found, 1 otherwise
#######################################
record_pdf_link() {
  local pdf_tag_re='<link[^>]*[[:space:]]format="pdf"[^>]*>'
  [[ $1 =~ $pdf_tag_re ]] || return 1
  xml_attr "${BASH_REMATCH[0]}" href
}

#######################################
# Report one record, download its PDF and log its metadata.
# Metadata is appended to METADATA_FILE only after a successful download.
# Globals:   METADATA_FILE (read)
# Arguments: $1 - text of one record element, starting at "<record" or at
#                 its first attribute
# Outputs:   progress messages on stdout, errors on stderr
# Returns:   1 if a PDF link was present but could not be downloaded,
#            0 otherwise (including records that are skipped)
#######################################
process_record() {
  local record=$1
  local start_tag=${record%%'>'*}
  local pmc_id title pdf_link

  pmc_id=$(xml_attr "$start_tag" id) || return 0
  [[ $pmc_id =~ ^PMC[0-9]+$ ]] || return 0
  echo "Processing article ID: $pmc_id"

  title=$(xml_attr "$start_tag" citation) || title=''

  if ! pdf_link=$(record_pdf_link "$record") || [[ -z $pdf_link ]]; then
    echo "No PDF link found for article ID: $pmc_id"
    return 0
  fi

  # The link comes from the remote response; accept only plain URLs so it can
  # never be interpreted by curl as an option or a local file.
  if ! [[ $pdf_link =~ ^(https?|ftp)://[^[:space:]]+$ ]]; then
    echo "Skipping unexpected PDF link for article ID $pmc_id: $pdf_link" >&2
    return 1
  fi

  echo "Title: $title"
  echo "Downloading PDF from: $pdf_link"
  if ! curl -f --connect-timeout 15 -O "$pdf_link"; then
    echo "Download failed for article ID: $pmc_id" >&2
    return 1
  fi

  {
    printf 'Title: %s\n' "$title"
    printf 'PDF Link: %s\n' "$pdf_link"
    printf '%s\n' '---------------------'
  } >> "$METADATA_FILE"
}

#######################################
# Fetch the record list and process at most the requested number of records.
# Globals:   OA_URL (read), limit (read, default for $1)
# Arguments: $1 - optional maximum number of records to process
# Outputs:   progress messages on stdout, errors on stderr
# Returns:   0 on success, 2 on a bad argument, 1 on fetch/download failure
#######################################
main() {
  local max_records=${1:-$limit}
  if ! [[ $max_records =~ ^[0-9]+$ ]]; then
    echo "limit must be a non-negative integer, got: $max_records" >&2
    return 2
  fi
  max_records=$((10#$max_records))  # force base 10 so "08" is not read as octal

  if ! command -v curl > /dev/null 2>&1; then
    echo "curl is required but was not found" >&2
    return 1
  fi

  # Fetch the list of articles with metadata in XML format
  local response
  if ! response=$(curl -fsS --connect-timeout 15 --max-time 120 \
    "${OA_URL}?format=pdf&limit=${max_records}"); then
    echo "Failed to fetch the article list from ${OA_URL}" >&2
    return 1
  fi

  local error_re='<error[^>]*>([^<]*)</error>'
  if [[ $response =~ $error_re ]]; then
    echo "OA service returned an error: ${BASH_REMATCH[1]}" >&2
    return 1
  fi

  # Walk the response record by record without relying on its line layout.
  # NOTE(review): "limit" does not appear among the parameters I know the OA
  # service to document, so the cap is enforced here as well - confirm.
  local remaining=$response record
  local seen=0 failures=0
  while ((seen < max_records)) && [[ $remaining == *'<record '* ]]; do
    remaining=${remaining#*'<record '}  # text following the next "<record "
    record=${remaining%%'</record>'*}   # attributes and children of that record
    seen=$((seen + 1))
    process_record "$record" || failures=$((failures + 1))
  done

  ((failures == 0))
}

main "$@"