Google Search Appliance Feeds Protocol Developers Guide User Manual

Page 40

Advertising
background image

Google Search Appliance: Feeds Protocol Developer’s Guide

40

Python Implementation of Creating a base64 Encoded
Content Feed

The following create_base64_content_feeds.py script goes through the PDF files located directly in MY_DIR
(subdirectories are not searched) and adds a base64-encoded content record for each of them to a single
feed file, base64_pdfs.xml. This file can then be used to add the documents that are in MY_DIR to the index.

import base64
import os
from xml.sax.saxutils import escape

# Directory that is scanned for PDF files (written with a trailing slash).
MY_DIR = "/var/www/files/"
# Name of the XML feed file that the script writes.
MY_FILE = "base64_pdfs.xml"

def main(src_dir=None, out_file=None):
    """Write a feed file with one base64-encoded record per PDF in a directory.

    Only regular files located directly in the source directory whose name
    ends in ``.pdf`` (case-insensitive) are included; subdirectories are not
    searched. Files are processed in sorted name order so that the output is
    reproducible. Everything is written as bytes, which keeps the function
    usable on both Python 2.6+ and Python 3.

    Args:
        src_dir: Directory to scan. Defaults to the module-level ``MY_DIR``.
        out_file: Path of the feed file to create. Defaults to the
            module-level ``MY_FILE``. An existing file is replaced.

    Returns:
        None. The feed is written to ``out_file`` and a one-line notice is
        printed to standard output.

    Raises:
        OSError / IOError: if the directory cannot be listed or a file cannot
            be read or written.
    """
    if src_dir is None:
        src_dir = MY_DIR
    if out_file is None:
        out_file = MY_FILE

    # Remove any previous feed so the new one is created from scratch.
    if os.path.exists(out_file):
        os.unlink(out_file)

    # The context manager closes the feed even if reading a PDF fails.
    with open(out_file, 'wb') as fh:
        fh.write(b'<?xml version="1.0" encoding="utf-8"?>\n')
        fh.write(b'<!DOCTYPE gsafeed PUBLIC "-//Google//DTD GSA Feeds//EN" "">\n')
        fh.write(b'<gsafeed>\n')
        fh.write(b'<header>\n')
        fh.write(b'\t<datasource>pdfs</datasource>\n')
        fh.write(b'\t<feedtype>incremental</feedtype>\n')
        fh.write(b'</header>\n')
        fh.write(b'<group>\n')

        # os.listdir() returns names in arbitrary order; sort for stable output.
        for my_file in sorted(os.listdir(src_dir)):
            path = os.path.join(src_dir, my_file)
            # Match on the extension only (a substring test would also pick up
            # names like "x.pdf.bak") and skip directories named "*.pdf".
            if not my_file.lower().endswith('.pdf'):
                continue
            if not os.path.isfile(path):
                continue

            with open(path, 'rb') as pdf:
                encoded_data = base64.b64encode(pdf.read())

            # The name lands inside a double-quoted XML attribute, so the
            # markup characters and the quote itself must be escaped.
            name = escape(my_file, {'"': '&quot;'})
            if not isinstance(name, bytes):
                name = name.encode('utf-8')

            fh.write(b'<record url="googleconnector://localhost.localdomain/' +
                     name + b'" mimetype="application/pdf">\n')
            fh.write(b'<content encoding="base64binary">' + encoded_data +
                     b'</content>\n')
            fh.write(b'</record>\n')

        fh.write(b'</group>\n')
        fh.write(b'</gsafeed>\n')

    # Parenthesized single argument prints identically on Python 2 and 3.
    print('Writing to file: %s' % out_file)

# Build the feed only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

Advertising