Augmenting RSDF output from SAP BW

1. Import Libraries:

import os
import re
import pyhdb
import pandas as pd
import xxx_dbconfig as config
from lxml import html
from bs4 import BeautifulSoup
hanadb = {'host': "",
'port': *****,
'user': "********",
'passwd': "*********"}

2. Function to build list of HTML files

def get_inputhtml(base_path, print_on):
input_path = base_path + '/input_html/'
folder = os.fsencode(input_path)
filenames = []
# loop through 'input_html' folder for all HTML files
for file in os.listdir(folder):
filename = os.fsdecode(file)
if filename.endswith( ('.HTML') ):

3. Function to extract the soup

def get_soup(web_template, base_path, print_on): #read an HTML file and construct the soup object
input_path = base_path + '/input_html/'
f = open(web_template)
soup = BeautifulSoup(f, 'html5lib')
if print_on == 'X':
print('Finished getting soup ...')

4. Function to find Transformations in the soup

def get_transformations(soup, print_on): #scan soup object and collect unique transformations
# find all transformations in html doc
trfms = []
for element in soup.find_all(text=re.compile('Transformation:')):
parentTable = element.findParent('table')
technm_cell = parentTable.find(text=re.compile('Technical Name:'))
trfm = technm_cell.findNext('td')
if print_on == 'X':
print('Finished getting transformations ...')

5. Function to clean up the soup

def clean_soup(soup, print_on, rem_extra_detail): #clean up various links
# remove extra detail ['Required Objects', 'ADSO Flags', 'Relays data to', 'Receives data from', 'Used by', 'Data Transfer Process']
if rem_extra_detail == 'X':
extra_detail_elements = ['Required Objects', 'ADSO Flags', 'Relays data to', 'Receives data from', 'Used by', 'Data Transfer Process']
for extra_detail in extra_detail_elements:
for block in soup.find_all(text=re.compile(extra_detail)):
if extra_detail == 'Data Transfer Process':
parentTable = block.findParent('table')
parentTbody = block.findParent('tbody')
if print_on == 'X':
print('Removed blocks containing: ' + extra_detail)
# remove broken links from the first 'Reqired Objects' table (if not already removed above)
indx = 0
for broken_links in soup.find_all(text=re.compile('Required Objects')):
if indx == 0:
bl_parentTbody = broken_links.findParent('tbody')
for bl_match in bl_parentTbody.find_all('a'):
indx += 1
# remove spans that contain 'FORMULA'
for formula_spans in soup.find_all(text=re.compile('Formula:')):
parentSpan = formula_spans.findParent('span')
# repair/reformat FORMULA links
invalid_tags = ['a', 'b', 'u']
for tag in invalid_tags:
for formula_links in soup.find_all(text=re.compile('FORMULA')):
parentTd = formula_links.findParent('td')
for match in parentTd.findAll(tag):
if print_on == 'X':
print('Cleaned the soup ...')

6. Function to retrive the missing detail from SAP BW

def get_data(trfms, print_on): #get active_formulas and active_tranfields
# load data from HANA db
connection = pyhdb.connect(host=config.hanadb['host'],
hana_list = str(trfms).replace('[', '(').replace(']', ')')
cursor = connection.cursor()
#Deliberately not relying on a single view to avoid HANA dependencies
#Get active formulas for all transformations in the document
where_clause = '(("TRANID" in ' + hana_list + ') AND ("OBJVERS" = \'A\') AND ( "KIND" = \'FORMULA\') )'
stmnt = 'SELECT * from SAPBWH.RSTRANSTEPROUT WHERE ' + where_clause
active_formulas = cursor.fetchall()
#Get ibjs for rule steps above
where_clause = '(("OBJVERS" = \'A\') AND ("TRANID" in ' + hana_list + ') AND ("PARAMTYPE" = \'1\'))'
stmnt = 'SELECT * FROM "SAPBWH"."RSTRANFIELD" WHERE ' + where_clause
active_tranfields = cursor.fetchall()
#Get rule step texts (used for formulas)
where_clause = '( ("LANGU" = \'EN\') AND ("TRANID" in ' + hana_list + ') AND ("OBJVERS" = \'A\') )'
stmnt = 'SELECT * FROM "SAPBWH"."RSTRANRULET" WHERE ' + where_clause
active_trantexts = cursor.fetchall()

#!!*** todo: get Descriptions ***!!
#for tran_step in active_formulas:
#active_formulas_wtxts = [ft for ft in active_trantexts if ft[4] == tran_step[]]
#!!*** ******************* ***!!

if print_on == 'X':
print('Retrieved data from HANA ...')
return(active_formulas, active_tranfields)

7. Function to inject the Formula detail into the HTML

def inject_formulas(trfms, soup, active_formulas, active_tranfields, print_on):
# loop over transformations and link formula code, and insert
for trfmation in trfms:
if print_on == 'X':
print('Trfmtn: ' + trfmation)
formula_steps = [sfs for sfs in active_formulas if sfs[0] == trfmation]
for single_formula_step in formula_steps:
#formula_descr = 'Description: ' +
formula_cd = 'Formula: ' + str(single_formula_step[7])
single_trfm_rules = [y for y in active_tranfields if y[0] == trfmation]
single_step = [y for y in single_trfm_rules if y[3] == single_formula_step[2]]
iobj_id = single_step[0][6]
if print_on == 'X':
print('IObj: ' + iobj_id)
span_txt = formula_cd
span_id = 'formula_' + iobj_id
new_span = soup.new_tag('div', id=span_id)
new_span.string = str(span_txt)
new_br = soup.new_tag('br')
new_span.insert(0, new_br)
for element2 in soup.find_all(text=re.compile('Transformation:')):
parentTable2 = element2.findParent('table')
curr_trfm_cell = parentTable2.find(text=re.compile('Technical Name:'))
curr_trfm_id = curr_trfm_cell.findNext('td').text
if curr_trfm_id == trfmation:
group_table = parentTable2.find(text=re.compile('Group')).find_parent('table')
iobj_element = group_table.find(text=re.compile(str(iobj_id)))
parentTbody = iobj_element.find_parent('tbody')
if print_on == 'X':
print('Finished injecting Formulas ...')

8. Function to write the corrected HTML to file

def write_output(soup, base_path, print_on):
# output modified html
html_output_path = base_path + '/output_html/'
html = soup.prettify("latin-1")
output_filename = input_fileName + "_output." + fileType
if print_on == 'X':
print('Writing file: ' + output_filename)
with open(output_filename, "wb") as file:
if print_on == 'X':
print('Finished writing file: ' + output_filename)

9. Function to convert HTML files to PDFs

def convert_pdf(base_path, print_on):
#import libraries
import pdfkit
filenames = []
# pdf_input_path is the output path of the HTML documents
pdf_input_path = base_path + '/output_html/'
folder = os.fsencode(pdf_input_path)
# loop through source folder for all HTML files
for file in os.listdir(folder):
filename = os.fsdecode(file)
if filename.endswith( ('_output.HTML') ):
if print_on == 'X':
print (' ')
print('Start of PDF conversion ...')
# For every HTML file: produce PDF
pdf_output_path = base_path + '/output_pdf/'
folder = os.fsencode(pdf_output_path)
for file in filenames:
pdf_filename = fileName + "_" + fileType + ".pdf"
if print_on == 'X':
print ("-> Processing: " + file)
pdfkit.from_file(pdf_input_path + file, pdf_output_path + pdf_filename)
if print_on == 'X':
print ("... copied to: " + pdf_filename)
print(' ')
if print_on == 'X':
print (' ')
print('End of PDF conversion ...')

10. Main script block

#---***---   Main Code Block  ---***---
# start of flags section for controlling execution
rem_extra_detail = 'X' #superfluous detail flag
print_on = 'X' #print messages flag
pdf_on = 'X' #conversion to PDF flag
# end of flags section for controlling execution
# get local path
base_path = os.getcwd()
# get list of HTML files to process
filenames = get_inputhtml(base_path, print_on)
# loop through relevant files
for web_template in filenames:
# control messaging
if print_on == 'X':
print(' ')
print('--> file: ' + web_template)
print(' ')

# convert html to BeautifulSoup object
soup = get_soup(web_template, base_path, print_on)

# extract transformations from the soup
transformations = get_transformations(soup, print_on)

# clean-up the soup, removing detail as required
soup_mod1 = clean_soup(soup, print_on, rem_extra_detail)

# query SAP HANA to return Formula details
active_formulas, active_tranfields = get_data(transformations, print_on)

# construct and then inject 'formula' HTML and write to output HTML
soup_mod2 = inject_formulas(transformations, soup_mod1, active_formulas, active_tranfields, print_on)
write_output(soup_mod2, base_path, print_on)

# if required, output to PDF
if pdf_on == 'X':
convert_pdf(base_path, print_on)
print(' ... and that\'s a wrap!')
  1. Get the list if HTML files
  2. For each file do the following:
  • Convert html to BeautifulSoup object
  • Extract transformations from the soup
  • Clean-up the soup, removing detail as required
  • Query SAP HANA to return Formula details
  • Construct and then inject ‘formula’ HTML and write to output HTML
  • Finally: if required, output to PDF


Full code:




Love podcasts or audiobooks? Learn on the go with our new app.

Recommended from Medium

Fast Fix for your Mac m1 / Fios Router Speed Issues

Rudiments of Containerization

Home DevOps Pipeline: A junior engineer’s tale (3/4)

Advantages of Earning VMware 1V0-41.20 Certification

Chatbot in Flutter with Dialogflow

Yes I Said It, RIP Mobile Apps

Deploying Kubernetes on AWS

Smoke testing in the cloud

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store
Andrew Jabbitt

Andrew Jabbitt

More from Medium

CS 373 Spring 2022 — Week 4

Everything about Solice LANDS

Album ReviewGeese — Projector |

ANNOUNCEMENT: War of Tribes Tiered System Passive Rewards & Mintpass is now LIVE!