Python script fails to write to confluence with a lot of data

Get involved · November 12, 2019

I wrote a python script that seemed to work pretty well. It lists EC2 instances in AWS and then writes them to our Confluence wiki.

If it processes one environment with 10 servers it works and writes to Confluence. If it works against an account with 100 or more servers it fails to write to Confluence with this stack trace:

Traceback (most recent call last):
File ".\aws_ec2_list_instances_orig.py", line 550, in <module>
main()
File ".\aws_ec2_list_instances_orig.py", line 543, in main
write_data_to_confluence(auth, html, pageid, title)
File ".\aws_ec2_list_instances_orig.py", line 391, in write_data_to_confluence
r.raise_for_status()
File "C:\Users\tdunphy\AppData\Roaming\Python\Python37\site-packages\requests\models.py", line 940, in raise_for_status
requests.exceptions.HTTPError: 400 Client Error: for url: https://wiki.us.cworld.company.com/rest/api/content/138317098

I've also raised a verbose error here:

Traceback (most recent call last):
File ".\aws_ec2_list_instances_orig.py", line 538, in <module>
main()
File ".\aws_ec2_list_instances_orig.py", line 531, in main
write_data_to_confluence(auth, html, pageid, title)
File ".\aws_ec2_list_instances_orig.py", line 380, in write_data_to_confluence
raise RuntimeError(r.content)
RuntimeError: b'{"statusCode":400,"data":{"authorized":false,"valid":true,"allowedInReadOnlyMode":true,"errors":[],"successful":false},"message":"Error parsing xhtml: Unexpected character \'<\' (code 60); expected a semi-colon after the reference for entity \'C\'\\n at [row,col {unknown-source}]: [1,46579]","reason":"Bad Request"}'

Please note I AM NOT ALLOWED TO POST THE COMPANY DOMAIN IN MY POSTS. I will substitute 'company.com' where my real company domain would be.

Here is the script:

#!/usr/bin/env python3

# Import modules
import boto3
import time
import objectpath
import csv
import os
import sys
import json
import requests
from requests_kerberos import HTTPKerberosAuth
import codecs
from datetime import datetime
from os.path import basename
from subprocess import check_output,CalledProcessError,PIPE

BASE_URL = "https://wiki.us.cworld.company.com/rest/api/content"
VIEW_URL = "https://wiki.us.cworld.company.com/pages/viewpage.action?pageId="

def banner(message, border='-'):
line = border * len(message)
print(line)
print(message)
print(line)

def initialize(interactive, aws_account):
# Set the date
today = datetime.today()
today = today.strftime("%m-%d-%Y")
# Set source files
aws_env_list="../../source_files/aws_environments/aws_environments_all.txt"
output_dir = "../../output_files/aws_instance_list/csv/"
output_file = output_dir + 'aws-instance-master-list-' + aws_account + '-' + today +'.csv'
output_file_name = 'aws-instance-master-list-' + aws_account + '-' + today +'.csv'
return today, aws_env_list, output_file, output_file_name

def authenticate():
#auth = get_login()
auth = ('tdunphy', 'local4tl4nt1cNJ!')
auth = str(auth).replace('(','').replace('\'','').replace(',',':').replace(')','').replace(' ','')
kerberos_auth = HTTPKerberosAuth(mutual_authentication="DISABLED",principal=auth)
auth = kerberos_auth
return auth

## These are dummy AWS account numbers. I cannot post account number for my company.
def aws_accounts_to_account_numbers(aws_account):
switcher = {
'company-lab': '123456789101',
'company-bill': '123456789102',
'company-stage': '123456789103',
'company-dlab': '123456789103',
}
return switcher.get(aws_account, "nothing")


def list_instances(aws_account,aws_account_number, interactive):
today, aws_env_list, output_file, output_file_name = initialize(interactive, aws_account)
engagement = None
# Set the account
session = boto3.Session(profile_name=aws_account)
ec2 = session.client("ec2")
fieldnames = [ 'AWS Account', 'Account Number', 'Name', 'Instance ID', 'VPC ID', 'Type', 'Platform', 'State', 'Key Name', 'Private IP', 'Public IP', 'Private DNS', 'Volumes', 'Availability Zone', 'Launch Date', 'Engagement Code']
# Set the ec2 dictionary
ec2info = {}
public_ips_list = ''
private_ips_list = ''
private_dns = None
with open(output_file, mode='w+') as csv_file:
writer = csv.DictWriter(csv_file, fieldnames=fieldnames, delimiter=',', lineterminator='\n')
writer.writeheader()
if 'gov' in aws_account and not 'admin' in aws_account:
print("This is a gov account.")
session = boto3.Session(profile_name=aws_account,region_name='us-gov-west-1')
else:
print("This is a commercial account.")
session = boto3.Session(profile_name=aws_account,region_name='us-east-1')

ec2 = session.client("ec2")
# Loop through the instances
instance_list = ec2.describe_instances()
for reservation in instance_list["Reservations"]:
for instance in reservation.get("Instances", []):
launch_time = instance["LaunchTime"]
launch_time_friendly = launch_time.strftime("%B %d %Y")
tree = objectpath.Tree(instance)
block_devices = set(tree.execute('$..BlockDeviceMappings[\'Ebs\'][\'VolumeId\']'))
if len(block_devices) == 0:
block_devices_list = None
else:
block_devices_list = list(block_devices)
block_devices_list = str(block_devices_list).replace('[','').replace(']','').replace('\'','').replace('{','').replace('}', '')
private_ips = set(tree.execute('$..PrivateIpAddress'))
if len(private_ips) == 0:
private_ips_list = None
else:
private_ips_list = list(private_ips)
private_ips_list = str(private_ips_list).replace('[','').replace(']','').replace('\'','')
public_ips = set(tree.execute('$..PublicIp'))
if len(public_ips) == 0:
public_ips_list = None
else:
public_ips_list = list(public_ips)
public_ips_list = str(public_ips_list).replace('[','').replace(']','').replace('\'','')
if 'KeyName' in instance:
key_name = instance['KeyName']
else:
key_name = None
name = None
if 'Tags' in instance:
try:
tags = instance['Tags']
name = None
for tag in tags:
if tag["Key"] == "Name":
name = tag["Value"]
for tag in tags:
if tag["Key"] == "Engagement":
engagement = tag["Value"]
else:
engagement = None
except ValueError:
print("Instance: %s has no tags" % instance_id)
if 'VpcId' in instance:
vpc_id = instance['VpcId']
else:
vpc_id = None
if 'PrivateDnsName' in instance:
private_dns = instance['PrivateDnsName']
else:
private_dns = None
if 'Platform' in instance:
platform = instance['Platform']
else:
platform = None
ec2info[instance['InstanceId']] = {
'AWS Account': aws_account,
'Account Number': aws_account_number,
'Name': name,
'Instance ID': instance['InstanceId'],
'VPC ID': vpc_id,
'Type': instance['InstanceType'],
'Platform': platform,
'State': instance['State']['Name'],
'Key Name': key_name,
'Private IP': private_ips_list,
'Public IP': public_ips_list,
'Private DNS': private_dns,
'Volumes': block_devices_list,
'Availability Zone': instance['Placement']['AvailabilityZone'],
'Launch Date': launch_time_friendly,
'Engagement Code': engagement
}
with open(output_file,'a') as csv_file:
writer = csv.DictWriter(csv_file, fieldnames=fieldnames, delimiter=',', lineterminator='\n')
writer.writerow({'AWS Account': aws_account, "Account Number": aws_account_number, 'Name': name, 'Instance ID': instance["InstanceId"], 'VPC ID': vpc_id, 'Type': instance["InstanceType"], 'Platform': platform, 'State': instance["State"]["Name"], 'Key Name': key_name, 'Private IP': private_ips_list, 'Public IP': public_ips_list, 'Private DNS': private_dns, 'Volumes': block_devices, 'Availability Zone': instance['Placement']['AvailabilityZone'], 'Launch Date': launch_time_friendly, 'Engagement Code': engagement})
for instance_id, instance in ec2info.items():
print(Fore.RESET + "-------------------------------------")
for key in [
'AWS Account',
'Account Number',
'Name',
'Instance ID',
'VPC ID',
'Type',
'Platform',
'Key Name',
'State',
'Private IP',
'Public IP',
'Private DNS',
'Volumes',
'Availability Zone',
'Launch Date',
'Engagement Code'
]:
print(Fore.GREEN + "{0}: {1}".format(key, instance.get(key))) 
time.sleep(2)
print(Fore.RESET + "-------------------------------------")
with open(output_file,'a') as csv_file:
csv_file.close()
return output_file


def convert_csv_to_html_table(output_file, today, interactive, aws_account):
output_dir = "../../output_files/aws_instance_list/html/"
if interactive == 1:
htmlfile = output_dir + 'aws-instance-master-list-' + aws_account + '-' + today +'.html'
htmlfile_name = 'aws-instance-master-list-' + aws_account + '-' + today +'.html'
else:
htmlfile = output_dir + 'aws-instance-master-list-' + today +'.html'
htmlfile_name = 'aws-instance-master-list-' + today +'.html'
count = 0
html = ''
with open(output_file,'r') as CSVFILE:
reader = csv.reader(CSVFILE)
with open(output_file,'r') as CSVFILE:
reader = csv.reader(CSVFILE)
html += "<table><tbody>"
for row in reader:
html += "<tr>"
# Process the headers
if count == 0:
for column in row:
html += "<th>%s</th>" % column
else:
# Process the data
for column in row:
html += "<td>%s</td>" % column
html += "</tr>"
count += 1
html += "</tbody></table>"
with open(htmlfile,'w+') as HTMLFILE:
HTMLFILE.write(html)
return htmlfile, htmlfile_name


def get_page_ancestors(auth, pageid):
# Get basic page information plus the ancestors property
url = '{base}/{pageid}?expand=ancestors'.format(
base = BASE_URL,
pageid = pageid)
r = requests.get(url, auth = auth)
r.raise_for_status()
return r.json()['ancestors']


def get_page_info(auth, pageid):
url = '{base}/{pageid}'.format(
base = BASE_URL,
pageid = pageid)
r = requests.get(url, auth = auth)
r.raise_for_status()
return r.json()


def write_data_to_confluence(auth, html, pageid, title = None):
info = get_page_info(auth, pageid)
ver = int(info['version']['number']) + 1
ancestors = get_page_ancestors(auth, pageid)
anc = ancestors[-1]
del anc['_links']
del anc['_expandable']
del anc['extensions']
if title is not None:
info['title'] = title
data = {
'id' : str(pageid),
'type' : 'page',
'title' : info['title'],
'version' : {'number' : ver},
'ancestors' : [anc],
'body' : {
'storage' :
{
'representation' : 'storage',
'value' : str(html)
}
}
}
data = json.dumps(data)
url = '{base}/{pageid}'.format(base = BASE_URL, pageid = pageid)
r = requests.put(
url,
data = data,
auth = auth,
headers = { 'Content-Type' : 'application/json' }
)
r.raise_for_status()
print("Wrote '%s' version %d" % (info['title'], ver))
print("URL: %s%d" % (VIEW_URL, pageid))

def main():
pageid = 138317098
title = 'AWS EC2 Instance List'
aws_account = input("Enter the name of the AWS account you'll be working in: ")
aws_account_number = aws_accounts_to_account_numbers(aws_account)
today, aws_env_list, output_file, output_file_name = initialize(interactive, aws_account)
output_file = list_instances(aws_account,aws_account_number, interactive)
htmlfile, htmlfile_name = convert_csv_to_html_table(output_file, today, interactive, aws_account)
with open(htmlfile, 'r', encoding='utf-8') as htmlfile:
html = htmlfile.read()
auth = authenticate()
write_data_to_confluence(auth, html, pageid, title)


if __name__ == "__main__":
main()

Why does this script fail to write to confluence only when it processes a lot of servers?

Product Q&A

Community resources

Support

Top groups

Community resources

Support

Learn

Community resources

Support

Events

Community resources

Support

Get product advice from experts

Join a community group

Advance your career with learning paths

Earn badges and rewards

Connect and share ideas at events

Python script fails to write to confluence with a lot of data

0 answers

Suggest an answer

TAGS

Atlassian Community Events