# Reference Hetland p. 427 (Beginning Python From Novice to Professional)
# Other resource: Chap 24 of Python in a Nutshell (Alex Martelli)
# Raynald Levesque August 2008
from xml.sax.handler import ContentHandler
from xml.sax import parse
import spss
# Location of the input document -- adjust both values to your environment.
fpath = r'C:/Test2'    # folder that holds the XML file (output goes there too)
fname = r'sample.xml'  # the enclosed sample data file
# Full path of the XML document handed to the SAX parser.
xmlFile = r'%(fpath)s/%(fname)s' % {'fpath': fpath, 'fname': fname}
class DataHandler(ContentHandler):
    """SAX handler that turns selected XML elements into tab separated text.

    Every start tag whose name is in ``wantList`` produces one string that is
    appended to the ``extract`` list supplied by the caller.  A record begins
    with its record type ('1' for action_log, '2' for action, '3' for
    Deletedaction); action and Deletedaction records carry, as second field,
    the id of the most recently started action_log.  The first action_log
    record and the first action/Deletedaction record are each preceded, in
    the same string, by a line holding the variable names.
    """

    in_extract = False

    def __init__(self, extract, wantList):
        """Remember the output list and the names of the elements to extract.

        extract  -- list receiving one text entry per extracted element
        wantList -- names of the XML elements to extract
        """
        ContentHandler.__init__(self)
        self.extract = extract
        self.data = []
        self.wantList = wantList
        # next 2 strings were copy/pasted from the XML document then 'cleaned'
        self.action_logAtt = ('id team1 team1_name team2 team2_name league league_id '
                              'date matchday season season_code start1 start2').split()
        self.actionAtt = ('aid action_code activitytype result id minute second '
                          'field_position receiver team_id x y z pace last_modified').split()
        self.DeletedactionAtt = self.actionAtt
        # next string was found by inspection of the XML file: these attributes
        # are not present on every action element
        self.actionOpt = 'subtype c1 c2 c3'.split()
        self.first_Action = True
        self.first_Log = True
        # id of the current action_log; empty until the first one is seen so
        # that a stray action/Deletedaction does not raise AttributeError
        self.action_logID = ''

    def startElement(self, name, attrs):
        """Build the record for a wanted element and append it to extract.

        Raises KeyError (from attrs.getValue) when a mandatory attribute of
        the element is missing.
        """
        self.in_extract = name in self.wantList
        if not self.in_extract:
            return
        if name == 'action_log':
            self.data = ["1\t"]  # record type 1
            self.data.extend(["%s\t" % attrs.getValue(att) for att in self.action_logAtt])
            # will be added at the beginning of each child record
            self.action_logID = attrs.getValue('id')
        elif name == 'action':
            self.data = ["2\t", self.action_logID + '\t']  # record type 2
            # an absent optional attribute still gets its (empty) column
            self.dataOpt = [(attrs.getValue(opt) + '\t') if opt in attrs else '\t'
                            for opt in self.actionOpt]
            self.data.extend(["%s\t" % attrs.getValue(att) for att in self.actionAtt])
            self.data.extend(self.dataOpt)
        elif name == 'Deletedaction':
            self.data = ["3\t", self.action_logID + '\t']  # record type 3
            self.data.extend(["%s\t" % attrs.getValue(att) for att in self.DeletedactionAtt])
        # Insert var names in front of the first record of each kind
        vnames = self._pending_header(name)
        if vnames is not None:
            self.data = ["%s\t" % v for v in vnames] + ['\n'] + self.data
        self.extract.append(''.join(self.data) + '\n')

    def _pending_header(self, name):
        """Return the variable names still to be emitted for name, or None.

        The choice depends on the element type, not on arrival order, so each
        kind of record gets the names that match its own columns.
        """
        if name == 'action_log':
            if self.first_Log:
                self.first_Log = False
                return ['recType'] + self.action_logAtt
        elif name in ('action', 'Deletedaction'):
            if self.first_Action:
                self.first_Action = False  # we won't come back through this if
                return ['recType', 'logID'] + self.actionAtt + self.actionOpt
        return None

    def endElement(self, name):
        """Reset the working record when a wanted element closes."""
        if name in self.wantList:
            self.data = []
        self.in_extract = False

    def characters(self, string):
        """Collect character data found directly inside a wanted element.

        The record has already been appended to extract by startElement, so
        this text does not reach the output.
        """
        if self.in_extract:
            self.data.append(string)
# Use the class to create the tab separated data files.
# Note: If we were dealing with a very large file, it would be preferable to
# create the 2 text files within the DataHandler class
extract = []
wantList = ['action_log', 'action', 'Deletedaction']  # Elements to extract
parse(xmlFile, DataHandler(extract, wantList))  # extract now contains the data
# sample.xml --> sample (a name without any '.' is kept whole)
nameroot = fname.split('.', 1)[0]
# The two names below are roots: '.txt' is added for the text files written
# here and the SPSS syntax adds '.txt' / '.sav' when reading and saving.
fLogName = r'%(fpath)s/%(nameroot)sLog' % vars()  # --> path/sampleLog
fActionName = r'%(fpath)s/%(nameroot)sAction' % vars()  # --> path/sampleAction
fLog = open(fLogName + '.txt', 'w')  # File to contain action_log info
try:
    fAction = open(fActionName + '.txt', 'w')  # action / Deletedaction info
    try:
        for s in extract:
            if not s:
                continue
            # An entry may start with a line of variable names; the record
            # type is the first character of the data line that follows it.
            record = s
            if s.startswith('recType'):
                record = s.split('\n', 1)[-1]
            recType = record[:1]
            s2 = s.encode('iso8859-1')  # unicode must be encoded before writing
            if recType in ('2', '3'):
                fAction.write(s2)
            elif recType == '1':
                fLog.write(s2)
    finally:
        fAction.close()
finally:
    fLog.close()
# SPSS syntax that reads the 2 tab delimited text files and saves each one as
# a .sav file. The GET DATA commands were pasted from SPSS after the files
# had been read there; only the file names are substituted below.
cmd = r"""
SET PRINTBACK=YES /MPRINT=YES.
DATASET CLOSE ALL.
GET DATA /TYPE = TXT
/FILE = "%(fLogName)s.txt"
/DELCASE = LINE
/DELIMITERS = "\t"
/ARRANGEMENT = DELIMITED
/FIRSTCASE = 2
/IMPORTCASE = ALL
/VARIABLES =
recType F1.0
id F6.0
team1 F3.0
team1_name A19
team2 F3.0
team2_name A16
league A14
league_id F1.0
date A22
matchday F2.0
season A9
season_code F2.0
start1 A22
start2 A22.
CACHE.
SAVE OUTFILE= "%(fLogName)s.sav".
GET DATA /TYPE = TXT
/FILE = "%(fActionName)s.txt"
/DELCASE = LINE
/DELIMITERS = "\t"
/ARRANGEMENT = DELIMITED
/FIRSTCASE = 2
/IMPORTCASE = ALL
/VARIABLES =
recType F1.0
logID F6.0
aid F7.0
action_code A4
activitytype F2.0
result F1.0
id F2.0
minute F2.0
second F2.0
field_position F2.0
receiver F5.0
team_id F3.0
x F5.3
y F5.3
z F5.3
pace F5.3
last_modified A22
subtype F2.0
c1 F1.0
c2 F1.0
c3 F1.0 .
CACHE.
SAVE OUTFILE="%(fActionName)s.sav".
""" % {'fLogName': fLogName, 'fActionName': fActionName}
# Run the whole syntax block in SPSS.
spss.Submit(cmd)