Google Search Appliance Feeds Protocol Developers Guide User Manual
Page 40

Google Search Appliance: Feeds Protocol Developer’s Guide
40
Python Implementation of Creating a base64 Encoded
Content Feed
The following create_base64_content_feeds.py script goes through all PDF files under MY_DIR and
adds a base64-encoded record for each of them to a single content feed, which it writes to the
base64_pdfs.xml file. This file can then be used to add the documents that are under MY_DIR to the index.
import base64
import os
from xml.sax.saxutils import escape
# Directory that main() scans for PDF files.
MY_DIR = "/var/www/files/"
# Feed file that main() writes; a relative path, so it is created in the
# current working directory.
MY_FILE = "base64_pdfs.xml"
# Fixed XML that opens the feed: declaration, DOCTYPE, header and <group>.
_FEED_HEAD = (
    b'<?xml version="1.0" encoding="utf-8"?>\n'
    b'<!DOCTYPE gsafeed PUBLIC "-//Google//DTD GSA Feeds//EN" "">\n'
    b'<gsafeed>\n'
    b'<header>\n'
    b'\t<datasource>pdfs</datasource>\n'
    b'\t<feedtype>incremental</feedtype>\n'
    b'</header>\n'
    b'<group>\n'
)
# Fixed XML that closes the <group> and the feed.
_FEED_TAIL = b'</group>\n</gsafeed>\n'
# Every record URL is this prefix followed by the PDF's file name.
_RECORD_URL_PREFIX = 'googleconnector://localhost.localdomain/'


def _utf8(text):
    """Return *text* as UTF-8 bytes; byte strings pass through unchanged."""
    return text if isinstance(text, bytes) else text.encode('utf-8')


def _record_bytes(directory, name):
    """Return the <record> element (as bytes) for the PDF *name* in *directory*."""
    # The context manager closes the PDF even if reading fails.
    with open(os.path.join(directory, name), 'rb') as pdf:
        encoded = base64.b64encode(pdf.read())
    # File names may contain &, < or ", which must be escaped inside an XML
    # attribute or the feed is not well-formed.
    url = _utf8(escape(_RECORD_URL_PREFIX + name, {'"': '&quot;'}))
    return b''.join([
        b'<record url="', url, b'" mimetype="application/pdf">\n',
        b'<content encoding="base64binary">', encoded, b'</content>\n',
        b'</record>\n',
    ])


def main(my_dir=None, my_file=None):
    """Write a content feed containing every PDF in a directory, base64-encoded.

    Each regular file in the directory whose name ends in ``.pdf`` (any
    letter case) becomes one ``<record>`` whose ``<content>`` is the
    base64-encoded file. Records are written in sorted file-name order so the
    output is reproducible. An existing feed file is overwritten.

    The feed is assembled as bytes, so the function behaves the same on
    Python 2.6+ and Python 3.

    Args:
        my_dir: Directory to scan (not recursive). Defaults to ``MY_DIR``.
        my_file: Path of the feed file to create. Defaults to ``MY_FILE``.

    Raises:
        OSError: If the directory cannot be listed, or a PDF or the feed file
            cannot be opened.
    """
    if my_dir is None:
        my_dir = MY_DIR
    if my_file is None:
        my_file = MY_FILE

    # Match on the extension rather than a substring so that names such as
    # "notes.pdf.bak" are skipped, and ignore directories that happen to be
    # named like a PDF.
    pdf_names = sorted(
        name for name in os.listdir(my_dir)
        if name.lower().endswith('.pdf')
        and os.path.isfile(os.path.join(my_dir, name))
    )

    # 'wb' truncates any previous feed, so no separate delete step is needed;
    # the context manager closes the file even when a PDF cannot be read.
    with open(my_file, 'wb') as feed:
        feed.write(_FEED_HEAD)
        for name in pdf_names:
            feed.write(_record_bytes(my_dir, name))
        feed.write(_FEED_TAIL)

    print('Writing to file: %s' % my_file)
# Build the feed only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()