No articles found
Try different keywords or browse our categories
Generate and Download XML Sitemap with Python
Learn how to create sitemap.xml files in Python. Download sitemaps from websites and generate custom XML sitemaps for SEO.
Sitemaps help search engines discover and index your website pages. Here’s how to generate and download sitemap XML files using Python.
Generate Basic Sitemap
No external libraries needed. Uses Python’s built-in XML library (note: `ET.indent`, used for pretty-printing throughout this article, requires Python 3.9 or newer).
from datetime import datetime
import xml.etree.ElementTree as ET
def generate_sitemap(urls, filename='sitemap.xml'):
    """Write a sitemap.xml file for the given URL entries.

    Args:
        urls: Iterable of dicts. Each dict must have a 'loc' key and may
            optionally include 'lastmod', 'changefreq', and 'priority'.
        filename: Output path for the generated XML (default 'sitemap.xml').
    """
    # Root element with the namespace required by the sitemap protocol.
    urlset = ET.Element('urlset')
    urlset.set('xmlns', 'http://www.sitemaps.org/schemas/sitemap/0.9')

    # Add one <url> entry per input dict; optional children are emitted
    # only when the corresponding key is present.
    for url_data in urls:
        url_elem = ET.SubElement(urlset, 'url')
        loc = ET.SubElement(url_elem, 'loc')
        loc.text = url_data['loc']
        if 'lastmod' in url_data:
            lastmod = ET.SubElement(url_elem, 'lastmod')
            lastmod.text = url_data['lastmod']
        if 'changefreq' in url_data:
            changefreq = ET.SubElement(url_elem, 'changefreq')
            changefreq.text = url_data['changefreq']
        if 'priority' in url_data:
            priority = ET.SubElement(url_elem, 'priority')
            priority.text = str(url_data['priority'])

    # Pretty-print and write; ET.indent requires Python 3.9+.
    tree = ET.ElementTree(urlset)
    ET.indent(tree, space=' ')
    tree.write(filename, encoding='utf-8', xml_declaration=True)
    # BUG FIX: the message printed a literal "(unknown)" placeholder
    # instead of interpolating the actual output filename.
    print(f"Sitemap saved to {filename}")
# Example usage: three entries exercising each optional field combination.
urls = [
    {
        'loc': 'https://example.com/',
        'lastmod': datetime.now().strftime('%Y-%m-%d'),
        'changefreq': 'daily',
        'priority': 1.0,
    },
    {
        'loc': 'https://example.com/about',
        'lastmod': '2024-12-30',
        'changefreq': 'monthly',
        'priority': 0.8,
    },
    {
        'loc': 'https://example.com/blog',
        'changefreq': 'weekly',
        'priority': 0.9,
    },
]
generate_sitemap(urls)
Download Sitemap from URL
Fetch and save sitemaps from websites.
import requests
def download_sitemap(url, filename='downloaded_sitemap.xml'):
    """Fetch a sitemap from *url* and save it to *filename*.

    Args:
        url: Absolute URL of the sitemap to download.
        filename: Local path to write the fetched XML to.

    Errors are reported to stdout rather than raised.
    """
    try:
        # BUG FIX: without a timeout, requests.get can block forever on
        # an unresponsive server.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(response.text)
        # BUG FIX: the message printed a literal "(unknown)" placeholder
        # instead of interpolating the actual output filename.
        print(f"Sitemap downloaded to {filename}")
    except requests.exceptions.RequestException as e:
        print(f"Error downloading sitemap: {e}")
# Example: fetch a remote sitemap and store it locally.
download_sitemap('https://example.com/sitemap.xml')
Parse Existing Sitemap
Extract URLs from sitemap XML files.
import xml.etree.ElementTree as ET
def parse_sitemap(filename):
    """Read a sitemap XML file and return its entries.

    Args:
        filename: Path to a sitemap.xml file.

    Returns:
        List of dicts with 'loc', 'lastmod', and 'priority' keys; a key's
        value is None when the element is absent from that <url> entry.
    """
    # Sitemap elements live in this namespace, so every find/findall
    # must use a prefixed path.
    ns = {'sm': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
    root = ET.parse(filename).getroot()

    def text_of(url_elem, tag):
        # Text of the child element, or None when the tag is missing.
        node = url_elem.find(f'sm:{tag}', ns)
        return node.text if node is not None else None

    return [
        {
            'loc': text_of(url, 'loc'),
            'lastmod': text_of(url, 'lastmod'),
            'priority': text_of(url, 'priority'),
        }
        for url in root.findall('sm:url', ns)
    ]
# Parse the generated sitemap and print each URL with its priority.
urls = parse_sitemap('sitemap.xml')
for url in urls:
    print(f"{url['loc']} (Priority: {url['priority']})")
Generate from Directory
Scan directory and create sitemap automatically.
import os
from datetime import datetime
import xml.etree.ElementTree as ET
def generate_from_directory(directory, base_url, filename='sitemap.xml'):
    """Scan *directory* for .html files and write a sitemap for them.

    Args:
        directory: Root directory to walk recursively.
        base_url: Site base URL prepended to each file's relative path.
        filename: Output path for the generated XML.
    """
    urls = []
    for root, _dirs, files in os.walk(directory):
        for file in files:
            if not file.endswith('.html'):
                continue
            file_path = os.path.join(root, file)
            # URL path mirrors the file's location relative to the root;
            # backslashes are normalized for Windows paths.
            rel_path = os.path.relpath(file_path, directory)
            url = base_url.rstrip('/') + '/' + rel_path.replace('\\', '/')
            # index.html maps to its directory URL.
            # BUG FIX: str.replace removed EVERY occurrence of the
            # substring (e.g. a directory literally named 'index.html'),
            # corrupting the URL; strip only the trailing suffix.
            if file == 'index.html':
                url = url[:-len('index.html')]
            # Use the file's modification time as lastmod (YYYY-MM-DD).
            mod_time = os.path.getmtime(file_path)
            lastmod = datetime.fromtimestamp(mod_time).strftime('%Y-%m-%d')
            urls.append({
                'loc': url,
                'lastmod': lastmod,
                'changefreq': 'weekly',
                'priority': 0.8,
            })

    # Build and write the sitemap document.
    urlset = ET.Element('urlset')
    urlset.set('xmlns', 'http://www.sitemaps.org/schemas/sitemap/0.9')
    for url_data in urls:
        url_elem = ET.SubElement(urlset, 'url')
        ET.SubElement(url_elem, 'loc').text = url_data['loc']
        ET.SubElement(url_elem, 'lastmod').text = url_data['lastmod']
        ET.SubElement(url_elem, 'changefreq').text = url_data['changefreq']
        ET.SubElement(url_elem, 'priority').text = str(url_data['priority'])

    tree = ET.ElementTree(urlset)
    ET.indent(tree, space=' ')  # Python 3.9+
    tree.write(filename, encoding='utf-8', xml_declaration=True)
    print(f"Generated sitemap with {len(urls)} URLs")
# Example: build a sitemap from a static-site build output folder.
generate_from_directory('./dist', 'https://example.com')
Sitemap Index
Create sitemap index for multiple sitemaps.
import xml.etree.ElementTree as ET
from datetime import datetime
def generate_sitemap_index(sitemaps, filename='sitemap-index.xml'):
    """Write a sitemap index file referencing the given sitemap URLs.

    Args:
        sitemaps: Iterable of absolute sitemap URLs.
        filename: Output path for the index XML.
    """
    sitemapindex = ET.Element('sitemapindex')
    sitemapindex.set('xmlns', 'http://www.sitemaps.org/schemas/sitemap/0.9')
    for sitemap_url in sitemaps:
        sitemap = ET.SubElement(sitemapindex, 'sitemap')
        loc = ET.SubElement(sitemap, 'loc')
        loc.text = sitemap_url
        # Each child sitemap is stamped with today's date.
        lastmod = ET.SubElement(sitemap, 'lastmod')
        lastmod.text = datetime.now().strftime('%Y-%m-%d')
    tree = ET.ElementTree(sitemapindex)
    ET.indent(tree, space=' ')
    tree.write(filename, encoding='utf-8', xml_declaration=True)
    # BUG FIX: the message printed a literal "(unknown)" placeholder
    # instead of interpolating the actual output filename.
    print(f"Sitemap index saved to {filename}")
# Example: one index entry per content-type sitemap.
sitemaps = [
    'https://example.com/sitemap-posts.xml',
    'https://example.com/sitemap-pages.xml',
    'https://example.com/sitemap-products.xml',
]
generate_sitemap_index(sitemaps)
Validate Sitemap
Check if sitemap is properly formatted.
import xml.etree.ElementTree as ET
def validate_sitemap(filename):
    """Check that *filename* is a well-formed sitemap.

    Prints a human-readable report and returns True when the file parses,
    declares the sitemap namespace, and every <url> has a non-empty <loc>.
    """
    ns = {'sm': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
    try:
        root = ET.parse(filename).getroot()
        # ElementTree embeds the namespace in the tag, e.g.
        # '{http://www.sitemaps.org/schemas/sitemap/0.9}urlset'.
        if 'sitemaps.org/schemas/sitemap' not in root.tag:
            print("Invalid sitemap namespace")
            return False
        urls = root.findall('sm:url', ns)
        print(f"Valid sitemap with {len(urls)} URLs")
        # Every entry must carry a non-empty <loc>.
        for idx, url in enumerate(urls, 1):
            loc = url.find('sm:loc', ns)
            if loc is None or not loc.text:
                print(f"URL {idx}: Missing loc element")
                return False
        print("All URLs have valid loc elements")
        return True
    except ET.ParseError as e:
        print(f"XML parsing error: {e}")
        return False
    except Exception as e:
        print(f"Validation error: {e}")
        return False
# Example: validate the sitemap produced earlier.
validate_sitemap('sitemap.xml')
Dynamic Sitemap from Database
Generate sitemap from database records.
import sqlite3
from datetime import datetime
import xml.etree.ElementTree as ET
def generate_from_database(db_path, base_url, filename='sitemap.xml'):
    """Build a sitemap from published posts stored in a SQLite database.

    Expects a ``posts`` table with ``slug``, ``updated_at``, ``category``
    and ``published`` columns -- adjust the query for your schema.

    Args:
        db_path: Path to the SQLite database file.
        base_url: Site base URL; post URLs become '{base_url}/blog/{slug}'.
        filename: Output path for the generated XML.
    """
    conn = sqlite3.connect(db_path)
    # BUG FIX: the connection was only closed on the success path; a
    # failing query leaked it. try/finally guarantees cleanup.
    try:
        cursor = conn.cursor()
        cursor.execute("""
            SELECT slug, updated_at, category
            FROM posts
            WHERE published = 1
        """)
        urlset = ET.Element('urlset')
        urlset.set('xmlns', 'http://www.sitemaps.org/schemas/sitemap/0.9')
        for slug, updated_at, _category in cursor.fetchall():
            url_elem = ET.SubElement(urlset, 'url')
            ET.SubElement(url_elem, 'loc').text = f"{base_url}/blog/{slug}"
            ET.SubElement(url_elem, 'lastmod').text = updated_at
            ET.SubElement(url_elem, 'priority').text = '0.8'
    finally:
        conn.close()
    tree = ET.ElementTree(urlset)
    ET.indent(tree, space=' ')
    tree.write(filename, encoding='utf-8', xml_declaration=True)
    # (f-string prefix removed -- the message has no placeholders)
    print("Generated sitemap from database")
# Example: build the sitemap from a local SQLite blog database.
generate_from_database('blog.db', 'https://example.com')
Compress Sitemap
Compress large sitemaps with gzip.
import gzip
import xml.etree.ElementTree as ET
def generate_compressed_sitemap(urls, filename='sitemap.xml.gz'):
    """Write a gzip-compressed sitemap for the given URL entries.

    Args:
        urls: Iterable of dicts, each with at least a 'loc' key.
        filename: Output path for the gzipped XML.
    """
    urlset = ET.Element('urlset')
    urlset.set('xmlns', 'http://www.sitemaps.org/schemas/sitemap/0.9')
    for url_data in urls:
        url_elem = ET.SubElement(urlset, 'url')
        loc = ET.SubElement(url_elem, 'loc')
        loc.text = url_data['loc']
    tree = ET.ElementTree(urlset)
    ET.indent(tree, space=' ')
    # gzip.open in text mode ('wt') lets ElementTree write unicode
    # straight into the compressed stream.
    with gzip.open(filename, 'wt', encoding='utf-8') as f:
        tree.write(f, encoding='unicode', xml_declaration=True)
    # BUG FIX: the message printed a literal "(unknown)" placeholder
    # instead of interpolating the actual output filename.
    print(f"Compressed sitemap saved to {filename}")
# Example: compress a large (1000-entry) synthetic sitemap.
urls = [{'loc': f'https://example.com/page-{i}'} for i in range(1000)]
generate_compressed_sitemap(urls)
Complete CLI Tool
Full command-line sitemap generator.
import argparse
import xml.etree.ElementTree as ET
from datetime import datetime
def create_sitemap_cli():
    """Command-line entry point: build a sitemap from --url/--pages flags."""
    parser = argparse.ArgumentParser(description='Generate XML sitemap')
    parser.add_argument('--url', required=True, help='Base URL')
    parser.add_argument('--pages', nargs='+', required=True, help='Page paths')
    parser.add_argument('--output', default='sitemap.xml', help='Output file')
    args = parser.parse_args()

    root = ET.Element('urlset')
    root.set('xmlns', 'http://www.sitemaps.org/schemas/sitemap/0.9')
    base = args.url.rstrip('/')
    for page in args.pages:
        # Join base and page with exactly one slash between them.
        entry = ET.SubElement(root, 'url')
        ET.SubElement(entry, 'loc').text = base + '/' + page.lstrip('/')
        ET.SubElement(entry, 'lastmod').text = datetime.now().strftime('%Y-%m-%d')

    document = ET.ElementTree(root)
    ET.indent(document, space=' ')
    document.write(args.output, encoding='utf-8', xml_declaration=True)
    print(f"Generated {args.output} with {len(args.pages)} URLs")


if __name__ == '__main__':
    create_sitemap_cli()
Run from terminal:
python sitemap_generator.py --url https://example.com --pages / about blog contact
Quick Reference
Basic Structure:
- `<urlset>` - Root element with namespace
- `<url>` - Container for each URL entry
- `<loc>` - URL location (required)
- `<lastmod>` - Last modification date (optional)
- `<changefreq>` - Update frequency (optional)
- `<priority>` - Priority 0.0 to 1.0 (optional)
Change Frequency Values:
- `always` - Changes every access
- `hourly` - Changes hourly
- `daily` - Changes daily
- `weekly` - Changes weekly
- `monthly` - Changes monthly
- `yearly` - Changes yearly
- `never` - Archived content
Best Practices:
- Limit to 50,000 URLs per sitemap
- Max file size 50MB (uncompressed)
- Use sitemap index for larger sites
- Include only canonical URLs
- Use absolute URLs with protocol
- Compress large sitemaps with gzip
Common Issues:
- Wrong namespace causes validation errors
- Missing `<?xml ... ?>` declaration
- Using relative URLs instead of absolute
- Special characters not escaped
- Incorrect date format (use YYYY-MM-DD)
Conclusion
Python makes sitemap generation simple with its built-in XML library. Use the basic generator for static sites, database queries for dynamic content, or directory scanning for file-based sites. All these methods create valid XML sitemaps that search engines can crawl.
Related Articles
Generate Excel Files from Raw Data with Python
Quick guide to creating Excel files from raw data using Python. Learn to use openpyxl, xlsxwriter, and pandas for Excel generation.
Python FFMPEG Integration: Edit Videos in Terminal
Master video editing from the command line using Python and FFmpeg. Learn to trim, merge, compress, and manipulate videos programmatically.
Read and Write CSV Files with Python
Simple guide to reading and writing CSV files in Python using csv module and pandas. Quick examples for data processing.