hawq-commits mailing list archives

From r...@apache.org
Subject [22/28] incubator-hawq git commit: HAWQ-837. Add python modules into HAWQ code
Date Tue, 21 Jun 2016 02:41:38 GMT
http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/80e25b46/tools/bin/ext/yaml/scanner.py
----------------------------------------------------------------------
diff --git a/tools/bin/ext/yaml/scanner.py b/tools/bin/ext/yaml/scanner.py
new file mode 100644
index 0000000..369c027
--- /dev/null
+++ b/tools/bin/ext/yaml/scanner.py
@@ -0,0 +1,1472 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Scanner produces tokens of the following types:
+# STREAM-START
+# STREAM-END
+# DIRECTIVE(name, value)
+# DOCUMENT-START
+# DOCUMENT-END
+# BLOCK-SEQUENCE-START
+# BLOCK-MAPPING-START
+# BLOCK-END
+# FLOW-SEQUENCE-START
+# FLOW-MAPPING-START
+# FLOW-SEQUENCE-END
+# FLOW-MAPPING-END
+# BLOCK-ENTRY
+# FLOW-ENTRY
+# KEY
+# VALUE
+# ALIAS(value)
+# ANCHOR(value)
+# TAG(value)
+# SCALAR(value, plain, style)
+#
+# Read comments in the Scanner code for more details.
+#
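+# For instance (illustrative), the one-line document
+#   key: value
+# is scanned into the token stream
+#   STREAM-START, BLOCK-MAPPING-START, KEY, SCALAR('key'),
+#   VALUE, SCALAR('value'), BLOCK-END, STREAM-END.
+#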
+
+__all__ = ['Scanner', 'ScannerError']
+
+from error import MarkedYAMLError
+from tokens import *
+
+class ScannerError(MarkedYAMLError):
+    pass
+
+class SimpleKey(object):
+    # See the simple keys treatment below.
+
+    def __init__(self, token_number, required, index, line, column, mark):
+        self.token_number = token_number
+        self.required = required
+        self.index = index
+        self.line = line
+        self.column = column
+        self.mark = mark
+
+class Scanner(object):
+
+    def __init__(self):
+        """Initialize the scanner."""
+        # It is assumed that Scanner and Reader will have a common descendant.
+        # Reader does the dirty work of checking for BOM and converting the
+        # input data to Unicode. It also adds NUL to the end.
+        #
+        # Reader supports the following methods
+        #   self.peek(i=0)       # peek the next i-th character
+        #   self.prefix(l=1)     # peek the next l characters
+        #   self.forward(l=1)    # read the next l characters and move the pointer.
+
+        # Have we reached the end of the stream?
+        self.done = False
+
+        # The number of unclosed '{' and '['. `flow_level == 0` means block
+        # context.
+        self.flow_level = 0
+
+        # List of processed tokens that are not yet emitted.
+        self.tokens = []
+
+        # Add the STREAM-START token.
+        self.fetch_stream_start()
+
+        # Number of tokens that were emitted through the `get_token` method.
+        self.tokens_taken = 0
+
+        # The current indentation level.
+        self.indent = -1
+
+        # Past indentation levels.
+        self.indents = []
+
+        # Variables related to simple keys treatment.
+
+        # A simple key is a key that is not denoted by the '?' indicator.
+        # Example of simple keys:
+        #   ---
+        #   block simple key: value
+        #   ? not a simple key:
+        #   : { flow simple key: value }
+        # We emit the KEY token before all keys, so when we find a potential
+        # simple key, we try to locate the corresponding ':' indicator.
+        # Simple keys should be limited to a single line and 1024 characters.
+
+        # Can a simple key start at the current position? A simple key may
+        # start:
+        # - at the beginning of the line, not counting indentation spaces
+        #       (in block context),
+        # - after '{', '[', ',' (in the flow context),
+        # - after '?', ':', '-' (in the block context).
+        # In the block context, this flag also signifies if a block collection
+        # may start at the current position.
+        self.allow_simple_key = True
+
+        # Keep track of possible simple keys. This is a dictionary. The key
+        # is `flow_level`; there can be no more than one possible simple key
+        # for each level. The value is a SimpleKey record:
+        #   (token_number, required, index, line, column, mark)
+        # A simple key may start with ALIAS, ANCHOR, TAG, SCALAR(flow),
+        # '[', or '{' tokens.
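+        # (Illustrative: while scanning the prefix '{ [ a', the dictionary
+        # can hold entries for level 0 ('{'), level 1 ('[') and level 2
+        # ('a') at the same time, since each bracket is saved as a possible
+        # key before `flow_level` is incremented.)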
+        self.possible_simple_keys = {}
+
+    # Public methods.
+
+    def check_token(self, *choices):
+        # Check if the next token is one of the given types.
+        while self.need_more_tokens():
+            self.fetch_more_tokens()
+        if self.tokens:
+            if not choices:
+                return True
+            for choice in choices:
+                if isinstance(self.tokens[0], choice):
+                    return True
+        return False
+
+    def peek_token(self):
+        # Return the next token, but do not delete it from the queue.
+        while self.need_more_tokens():
+            self.fetch_more_tokens()
+        if self.tokens:
+            return self.tokens[0]
+
+    def get_token(self):
+        # Return the next token.
+        while self.need_more_tokens():
+            self.fetch_more_tokens()
+        if self.tokens:
+            self.tokens_taken += 1
+            return self.tokens.pop(0)
+
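+    # A minimal driver loop over the three public methods above (sketch;
+    # in the real package they are reached through a Loader class that
+    # mixes Reader and Scanner together):
+    #   while not scanner.check_token(StreamEndToken):
+    #       token = scanner.get_token()
+    #       ...                         # dispatch on type(token)
+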
+    # Private methods.
+
+    def need_more_tokens(self):
+        if self.done:
+            return False
+        if not self.tokens:
+            return True
+        # The current token may be a potential simple key, so we
+        # need to look further.
+        self.stale_possible_simple_keys()
+        if self.next_possible_simple_key() == self.tokens_taken:
+            return True
+
+    def fetch_more_tokens(self):
+
+        # Eat whitespaces and comments until we reach the next token.
+        self.scan_to_next_token()
+
+        # Remove obsolete possible simple keys.
+        self.stale_possible_simple_keys()
+
+        # Compare the current indentation and column. It may add some tokens
+        # and decrease the current indentation level.
+        self.unwind_indent(self.column)
+
+        # Peek the next character.
+        ch = self.peek()
+
+        # Is it the end of stream?
+        if ch == u'\0':
+            return self.fetch_stream_end()
+
+        # Is it a directive?
+        if ch == u'%' and self.check_directive():
+            return self.fetch_directive()
+
+        # Is it the document start?
+        if ch == u'-' and self.check_document_start():
+            return self.fetch_document_start()
+
+        # Is it the document end?
+        if ch == u'.' and self.check_document_end():
+            return self.fetch_document_end()
+
+        # TODO: support for BOM within a stream.
+        #if ch == u'\uFEFF':
+        #    return self.fetch_bom()    <-- issue BOMToken
+
+        # Note: the order of the following checks is NOT significant.
+
+        # Is it the flow sequence start indicator?
+        if ch == u'[':
+            return self.fetch_flow_sequence_start()
+
+        # Is it the flow mapping start indicator?
+        if ch == u'{':
+            return self.fetch_flow_mapping_start()
+
+        # Is it the flow sequence end indicator?
+        if ch == u']':
+            return self.fetch_flow_sequence_end()
+
+        # Is it the flow mapping end indicator?
+        if ch == u'}':
+            return self.fetch_flow_mapping_end()
+
+        # Is it the flow entry indicator?
+        if ch == u',':
+            return self.fetch_flow_entry()
+
+        # Is it the block entry indicator?
+        if ch == u'-' and self.check_block_entry():
+            return self.fetch_block_entry()
+
+        # Is it the key indicator?
+        if ch == u'?' and self.check_key():
+            return self.fetch_key()
+
+        # Is it the value indicator?
+        if ch == u':' and self.check_value():
+            return self.fetch_value()
+
+        # Is it an alias?
+        if ch == u'*':
+            return self.fetch_alias()
+
+        # Is it an anchor?
+        if ch == u'&':
+            return self.fetch_anchor()
+
+        # Is it a tag?
+        if ch == u'!':
+            return self.fetch_tag()
+
+        # Is it a literal scalar?
+        if ch == u'|' and not self.flow_level:
+            return self.fetch_literal()
+
+        # Is it a folded scalar?
+        if ch == u'>' and not self.flow_level:
+            return self.fetch_folded()
+
+        # Is it a single quoted scalar?
+        if ch == u'\'':
+            return self.fetch_single()
+
+        # Is it a double quoted scalar?
+        if ch == u'\"':
+            return self.fetch_double()
+
+        # It must be a plain scalar then.
+        if self.check_plain():
+            return self.fetch_plain()
+
+        # No? It's an error. Let's produce a nice error message.
+        raise ScannerError("while scanning for the next token", None,
+                "found character %r that cannot start any token"
+                % ch.encode('utf-8'), self.get_mark())
+
+    # Simple keys treatment.
+
+    def next_possible_simple_key(self):
+        # Return the number of the nearest possible simple key. Actually we
+        # don't need to loop through the whole dictionary. We may replace it
+        # with the following code:
+        #   if not self.possible_simple_keys:
+        #       return None
+        #   return self.possible_simple_keys[
+        #           min(self.possible_simple_keys.keys())].token_number
+        min_token_number = None
+        for level in self.possible_simple_keys:
+            key = self.possible_simple_keys[level]
+            if min_token_number is None or key.token_number < min_token_number:
+                min_token_number = key.token_number
+        return min_token_number
+
+    def stale_possible_simple_keys(self):
+        # Remove entries that are no longer possible simple keys. According to
+        # the YAML specification, simple keys
+        # - should be limited to a single line,
+        # - should be no longer than 1024 characters.
+        # Disabling this procedure will allow simple keys of any length and
+        # height (may cause problems if indentation is broken though).
+        for level in self.possible_simple_keys.keys():
+            key = self.possible_simple_keys[level]
+            if key.line != self.line  \
+                    or self.index-key.index > 1024:
+                if key.required:
+                    raise ScannerError("while scanning a simple key", key.mark,
+                            "could not find expected ':'", self.get_mark())
+                del self.possible_simple_keys[level]
+
+    def save_possible_simple_key(self):
+        # The next token may start a simple key. We check if it's possible
+        # and save its position. This function is called for
+        #   ALIAS, ANCHOR, TAG, SCALAR(flow), '[', and '{'.
+
+        # Check if a simple key is required at the current position.
+        required = not self.flow_level and self.indent == self.column
+
+        # A simple key is required only if it is the first token in the current
+        # line. Therefore it is always allowed.
+        assert self.allow_simple_key or not required
+
+        # The next token might be a simple key. Let's save its number and
+        # position.
+        if self.allow_simple_key:
+            self.remove_possible_simple_key()
+            token_number = self.tokens_taken+len(self.tokens)
+            key = SimpleKey(token_number, required,
+                    self.index, self.line, self.column, self.get_mark())
+            self.possible_simple_keys[self.flow_level] = key
+
+    def remove_possible_simple_key(self):
+        # Remove the saved possible key position at the current flow level.
+        if self.flow_level in self.possible_simple_keys:
+            key = self.possible_simple_keys[self.flow_level]
+            
+            if key.required:
+                raise ScannerError("while scanning a simple key", key.mark,
+                        "could not find expected ':'", self.get_mark())
+
+            del self.possible_simple_keys[self.flow_level]
+
+    # Indentation functions.
+
+    def unwind_indent(self, column):
+
+        ## In flow context, tokens should respect indentation.
+        ## Actually the condition should be `self.indent >= column` according to
+        ## the spec. But this condition will prohibit intuitively correct
+        ## constructions such as
+        ## key : {
+        ## }
+        #if self.flow_level and self.indent > column:
+        #    raise ScannerError(None, None,
+        #            "invalid indentation or unclosed '[' or '{'",
+        #            self.get_mark())
+
+        # In the flow context, indentation is ignored. We make the scanner less
+        # restrictive than the specification requires.
+        if self.flow_level:
+            return
+
+        # In block context, we may need to issue the BLOCK-END tokens.
+        while self.indent > column:
+            mark = self.get_mark()
+            self.indent = self.indents.pop()
+            self.tokens.append(BlockEndToken(mark, mark))
+
+    def add_indent(self, column):
+        # Check if we need to increase indentation.
+        if self.indent < column:
+            self.indents.append(self.indent)
+            self.indent = column
+            return True
+        return False
+
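+    # (Illustrative: scanning nested block mappings that start at columns 0
+    # and 2 leaves self.indents == [-1, 0] and self.indent == 2;
+    # unwind_indent then pops one BLOCK-END per level once the column
+    # drops back below each saved indent.)
+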
+    # Fetchers.
+
+    def fetch_stream_start(self):
+        # We always add STREAM-START as the first token and STREAM-END as the
+        # last token.
+
+        # Read the token.
+        mark = self.get_mark()
+        
+        # Add STREAM-START.
+        self.tokens.append(StreamStartToken(mark, mark,
+            encoding=self.encoding))
+        
+
+    def fetch_stream_end(self):
+
+        # Set the current indentation to -1.
+        self.unwind_indent(-1)
+
+        # Reset everything (not really needed).
+        self.allow_simple_key = False
+        self.possible_simple_keys = {}
+
+        # Read the token.
+        mark = self.get_mark()
+        
+        # Add STREAM-END.
+        self.tokens.append(StreamEndToken(mark, mark))
+
+        # The stream is finished.
+        self.done = True
+
+    def fetch_directive(self):
+        
+        # Set the current indentation to -1.
+        self.unwind_indent(-1)
+
+        # Reset simple keys.
+        self.remove_possible_simple_key()
+        self.allow_simple_key = False
+
+        # Scan and add DIRECTIVE.
+        self.tokens.append(self.scan_directive())
+
+    def fetch_document_start(self):
+        self.fetch_document_indicator(DocumentStartToken)
+
+    def fetch_document_end(self):
+        self.fetch_document_indicator(DocumentEndToken)
+
+    def fetch_document_indicator(self, TokenClass):
+
+        # Set the current indentation to -1.
+        self.unwind_indent(-1)
+
+        # Reset simple keys. Note that there cannot be a block collection
+        # after '---'.
+        self.remove_possible_simple_key()
+        self.allow_simple_key = False
+
+        # Add DOCUMENT-START or DOCUMENT-END.
+        start_mark = self.get_mark()
+        self.forward(3)
+        end_mark = self.get_mark()
+        self.tokens.append(TokenClass(start_mark, end_mark))
+
+    def fetch_flow_sequence_start(self):
+        self.fetch_flow_collection_start(FlowSequenceStartToken)
+
+    def fetch_flow_mapping_start(self):
+        self.fetch_flow_collection_start(FlowMappingStartToken)
+
+    def fetch_flow_collection_start(self, TokenClass):
+
+        # '[' and '{' may start a simple key.
+        self.save_possible_simple_key()
+
+        # Increase the flow level.
+        self.flow_level += 1
+
+        # Simple keys are allowed after '[' and '{'.
+        self.allow_simple_key = True
+
+        # Add FLOW-SEQUENCE-START or FLOW-MAPPING-START.
+        start_mark = self.get_mark()
+        self.forward()
+        end_mark = self.get_mark()
+        self.tokens.append(TokenClass(start_mark, end_mark))
+
+    def fetch_flow_sequence_end(self):
+        self.fetch_flow_collection_end(FlowSequenceEndToken)
+
+    def fetch_flow_mapping_end(self):
+        self.fetch_flow_collection_end(FlowMappingEndToken)
+
+    def fetch_flow_collection_end(self, TokenClass):
+
+        # Reset possible simple key on the current level.
+        self.remove_possible_simple_key()
+
+        # Decrease the flow level.
+        self.flow_level -= 1
+
+        # No simple keys after ']' or '}'.
+        self.allow_simple_key = False
+
+        # Add FLOW-SEQUENCE-END or FLOW-MAPPING-END.
+        start_mark = self.get_mark()
+        self.forward()
+        end_mark = self.get_mark()
+        self.tokens.append(TokenClass(start_mark, end_mark))
+
+    def fetch_flow_entry(self):
+
+        # Simple keys are allowed after ','.
+        self.allow_simple_key = True
+
+        # Reset possible simple key on the current level.
+        self.remove_possible_simple_key()
+
+        # Add FLOW-ENTRY.
+        start_mark = self.get_mark()
+        self.forward()
+        end_mark = self.get_mark()
+        self.tokens.append(FlowEntryToken(start_mark, end_mark))
+
+    def fetch_block_entry(self):
+
+        # Block context needs additional checks.
+        if not self.flow_level:
+
+            # Are we allowed to start a new entry?
+            if not self.allow_simple_key:
+                raise ScannerError(None, None,
+                        "sequence entries are not allowed here",
+                        self.get_mark())
+
+            # We may need to add BLOCK-SEQUENCE-START.
+            if self.add_indent(self.column):
+                mark = self.get_mark()
+                self.tokens.append(BlockSequenceStartToken(mark, mark))
+
+        # It's an error for the block entry to occur in the flow context,
+        # but we let the parser detect this.
+        else:
+            pass
+
+        # Simple keys are allowed after '-'.
+        self.allow_simple_key = True
+
+        # Reset possible simple key on the current level.
+        self.remove_possible_simple_key()
+
+        # Add BLOCK-ENTRY.
+        start_mark = self.get_mark()
+        self.forward()
+        end_mark = self.get_mark()
+        self.tokens.append(BlockEntryToken(start_mark, end_mark))
+
+    def fetch_key(self):
+        
+        # Block context needs additional checks.
+        if not self.flow_level:
+
+            # Are we allowed to start a key (not necessarily a simple one)?
+            if not self.allow_simple_key:
+                raise ScannerError(None, None,
+                        "mapping keys are not allowed here",
+                        self.get_mark())
+
+            # We may need to add BLOCK-MAPPING-START.
+            if self.add_indent(self.column):
+                mark = self.get_mark()
+                self.tokens.append(BlockMappingStartToken(mark, mark))
+
+        # Simple keys are allowed after '?' in the block context.
+        self.allow_simple_key = not self.flow_level
+
+        # Reset possible simple key on the current level.
+        self.remove_possible_simple_key()
+
+        # Add KEY.
+        start_mark = self.get_mark()
+        self.forward()
+        end_mark = self.get_mark()
+        self.tokens.append(KeyToken(start_mark, end_mark))
+
+    def fetch_value(self):
+
+        # Have we found a possible simple key?
+        if self.flow_level in self.possible_simple_keys:
+
+            # Add KEY.
+            key = self.possible_simple_keys[self.flow_level]
+            del self.possible_simple_keys[self.flow_level]
+            self.tokens.insert(key.token_number-self.tokens_taken,
+                    KeyToken(key.mark, key.mark))
+
+            # If this key starts a new block mapping, we need to add
+            # BLOCK-MAPPING-START.
+            if not self.flow_level:
+                if self.add_indent(key.column):
+                    self.tokens.insert(key.token_number-self.tokens_taken,
+                            BlockMappingStartToken(key.mark, key.mark))
+
+            # There cannot be two simple keys one after another.
+            self.allow_simple_key = False
+
+        # It must be a part of a complex key.
+        else:
+            
+            # Block context needs additional checks.
+            # (Do we really need them? They will be caught by the parser
+            # anyway.)
+            if not self.flow_level:
+
+                # We are allowed to start a complex value if and only if
+                # we can start a simple key.
+                if not self.allow_simple_key:
+                    raise ScannerError(None, None,
+                            "mapping values are not allowed here",
+                            self.get_mark())
+
+            # If this value starts a new block mapping, we need to add
+            # BLOCK-MAPPING-START.  It will be detected as an error later by
+            # the parser.
+            if not self.flow_level:
+                if self.add_indent(self.column):
+                    mark = self.get_mark()
+                    self.tokens.append(BlockMappingStartToken(mark, mark))
+
+            # Simple keys are allowed after ':' in the block context.
+            self.allow_simple_key = not self.flow_level
+
+            # Reset possible simple key on the current level.
+            self.remove_possible_simple_key()
+
+        # Add VALUE.
+        start_mark = self.get_mark()
+        self.forward()
+        end_mark = self.get_mark()
+        self.tokens.append(ValueToken(start_mark, end_mark))
+
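+    # (Illustrative walk-through: for the input 'a: 1' in block context,
+    # SCALAR('a') is appended to the queue first; when ':' is reached,
+    # fetch_value inserts KEY -- and BLOCK-MAPPING-START, via add_indent --
+    # before that scalar, so the queue becomes
+    #   BLOCK-MAPPING-START, KEY, SCALAR('a'), VALUE.)
+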
+    def fetch_alias(self):
+
+        # ALIAS could be a simple key.
+        self.save_possible_simple_key()
+
+        # No simple keys after ALIAS.
+        self.allow_simple_key = False
+
+        # Scan and add ALIAS.
+        self.tokens.append(self.scan_anchor(AliasToken))
+
+    def fetch_anchor(self):
+
+        # ANCHOR could start a simple key.
+        self.save_possible_simple_key()
+
+        # No simple keys after ANCHOR.
+        self.allow_simple_key = False
+
+        # Scan and add ANCHOR.
+        self.tokens.append(self.scan_anchor(AnchorToken))
+
+    def fetch_tag(self):
+
+        # TAG could start a simple key.
+        self.save_possible_simple_key()
+
+        # No simple keys after TAG.
+        self.allow_simple_key = False
+
+        # Scan and add TAG.
+        self.tokens.append(self.scan_tag())
+
+    def fetch_literal(self):
+        self.fetch_block_scalar(style='|')
+
+    def fetch_folded(self):
+        self.fetch_block_scalar(style='>')
+
+    def fetch_block_scalar(self, style):
+
+        # A simple key may follow a block scalar.
+        self.allow_simple_key = True
+
+        # Reset possible simple key on the current level.
+        self.remove_possible_simple_key()
+
+        # Scan and add SCALAR.
+        self.tokens.append(self.scan_block_scalar(style))
+
+    def fetch_single(self):
+        self.fetch_flow_scalar(style='\'')
+
+    def fetch_double(self):
+        self.fetch_flow_scalar(style='"')
+
+    def fetch_flow_scalar(self, style):
+
+        # A flow scalar could be a simple key.
+        self.save_possible_simple_key()
+
+        # No simple keys after flow scalars.
+        self.allow_simple_key = False
+
+        # Scan and add SCALAR.
+        self.tokens.append(self.scan_flow_scalar(style))
+
+    def fetch_plain(self):
+
+        # A plain scalar could be a simple key.
+        self.save_possible_simple_key()
+
+        # No simple keys after plain scalars. But note that `scan_plain` will
+        # change this flag if the scan is finished at the beginning of the
+        # line.
+        self.allow_simple_key = False
+
+        # Scan and add SCALAR. May change `allow_simple_key`.
+        self.tokens.append(self.scan_plain())
+
+    # Checkers.
+
+    def check_directive(self):
+
+        # DIRECTIVE:        ^ '%' ...
+        # The '%' indicator is already checked.
+        if self.column == 0:
+            return True
+
+    def check_document_start(self):
+
+        # DOCUMENT-START:   ^ '---' (' '|'\n')
+        if self.column == 0:
+            if self.prefix(3) == u'---'  \
+                    and self.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
+                return True
+
+    def check_document_end(self):
+
+        # DOCUMENT-END:     ^ '...' (' '|'\n')
+        if self.column == 0:
+            if self.prefix(3) == u'...'  \
+                    and self.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
+                return True
+
+    def check_block_entry(self):
+
+        # BLOCK-ENTRY:      '-' (' '|'\n')
+        return self.peek(1) in u'\0 \t\r\n\x85\u2028\u2029'
+
+    def check_key(self):
+
+        # KEY(flow context):    '?'
+        if self.flow_level:
+            return True
+
+        # KEY(block context):   '?' (' '|'\n')
+        else:
+            return self.peek(1) in u'\0 \t\r\n\x85\u2028\u2029'
+
+    def check_value(self):
+
+        # VALUE(flow context):  ':'
+        if self.flow_level:
+            return True
+
+        # VALUE(block context): ':' (' '|'\n')
+        else:
+            return self.peek(1) in u'\0 \t\r\n\x85\u2028\u2029'
+
+    def check_plain(self):
+
+        # A plain scalar may start with any non-space character except:
+        #   '-', '?', ':', ',', '[', ']', '{', '}',
+        #   '#', '&', '*', '!', '|', '>', '\'', '\"',
+        #   '%', '@', '`'.
+        #
+        # It may also start with
+        #   '-', '?', ':'
+        # if it is followed by a non-space character.
+        #
+        # Note that we limit the last rule to the block context (except the
+        # '-' character) because we want the flow context to be space
+        # independent.
+        ch = self.peek()
+        return ch not in u'\0 \t\r\n\x85\u2028\u2029-?:,[]{}#&*!|>\'\"%@`'  \
+                or (self.peek(1) not in u'\0 \t\r\n\x85\u2028\u2029'
+                        and (ch == u'-' or (not self.flow_level and ch in u'?:')))
+
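+    # (Illustrative: '-foo' and ':bar' may start plain scalars in the block
+    # context because the indicator is followed by a non-space character,
+    # while '- foo' is scanned as a block entry instead.)
+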
+    # Scanners.
+
+    def scan_to_next_token(self):
+        # We ignore spaces, line breaks and comments.
+        # If we find a line break in the block context, we set the flag
+        # `allow_simple_key` on.
+        # The byte order mark is stripped if it's the first character in the
+        # stream. We do not yet support BOM inside the stream as the
+        # specification requires. Any such mark will be considered as a part
+        # of the document.
+        #
+        # TODO: We need to make tab handling rules more sane. A good rule is
+        #   Tabs cannot precede tokens
+        #   BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END,
+        #   KEY(block), VALUE(block), BLOCK-ENTRY
+        # So the checking code is
+        #   if <TAB>:
+        #       self.allow_simple_keys = False
+        # We also need to add the check for `allow_simple_keys == True` to
+        # `unwind_indent` before issuing BLOCK-END.
+        # Scanners for block, flow, and plain scalars need to be modified.
+
+        if self.index == 0 and self.peek() == u'\uFEFF':
+            self.forward()
+        found = False
+        while not found:
+            while self.peek() == u' ':
+                self.forward()
+            if self.peek() == u'#':
+                while self.peek() not in u'\0\r\n\x85\u2028\u2029':
+                    self.forward()
+            if self.scan_line_break():
+                if not self.flow_level:
+                    self.allow_simple_key = True
+            else:
+                found = True
+
+    def scan_directive(self):
+        # See the specification for details.
+        start_mark = self.get_mark()
+        self.forward()
+        name = self.scan_directive_name(start_mark)
+        value = None
+        if name == u'YAML':
+            value = self.scan_yaml_directive_value(start_mark)
+            end_mark = self.get_mark()
+        elif name == u'TAG':
+            value = self.scan_tag_directive_value(start_mark)
+            end_mark = self.get_mark()
+        else:
+            end_mark = self.get_mark()
+            while self.peek() not in u'\0\r\n\x85\u2028\u2029':
+                self.forward()
+        self.scan_directive_ignored_line(start_mark)
+        return DirectiveToken(name, value, start_mark, end_mark)
+
+    def scan_directive_name(self, start_mark):
+        # See the specification for details.
+        length = 0
+        ch = self.peek(length)
+        while u'0' <= ch <= u'9' or u'A' <= ch <= 'Z' or u'a' <= ch <= 'z'  \
+                or ch in u'-_':
+            length += 1
+            ch = self.peek(length)
+        if not length:
+            raise ScannerError("while scanning a directive", start_mark,
+                    "expected alphabetic or numeric character, but found %r"
+                    % ch.encode('utf-8'), self.get_mark())
+        value = self.prefix(length)
+        self.forward(length)
+        ch = self.peek()
+        if ch not in u'\0 \r\n\x85\u2028\u2029':
+            raise ScannerError("while scanning a directive", start_mark,
+                    "expected alphabetic or numeric character, but found %r"
+                    % ch.encode('utf-8'), self.get_mark())
+        return value
+
+    def scan_yaml_directive_value(self, start_mark):
+        # See the specification for details.
+        while self.peek() == u' ':
+            self.forward()
+        major = self.scan_yaml_directive_number(start_mark)
+        if self.peek() != '.':
+            raise ScannerError("while scanning a directive", start_mark,
+                    "expected a digit or '.', but found %r"
+                    % self.peek().encode('utf-8'),
+                    self.get_mark())
+        self.forward()
+        minor = self.scan_yaml_directive_number(start_mark)
+        if self.peek() not in u'\0 \r\n\x85\u2028\u2029':
+            raise ScannerError("while scanning a directive", start_mark,
+                    "expected a digit or ' ', but found %r"
+                    % self.peek().encode('utf-8'),
+                    self.get_mark())
+        return (major, minor)
+
+    def scan_yaml_directive_number(self, start_mark):
+        # See the specification for details.
+        ch = self.peek()
+        if not (u'0' <= ch <= '9'):
+            raise ScannerError("while scanning a directive", start_mark,
+                    "expected a digit, but found %r" % ch.encode('utf-8'),
+                    self.get_mark())
+        length = 0
+        while u'0' <= self.peek(length) <= u'9':
+            length += 1
+        value = int(self.prefix(length))
+        self.forward(length)
+        return value
+
+    def scan_tag_directive_value(self, start_mark):
+        # See the specification for details.
+        while self.peek() == u' ':
+            self.forward()
+        handle = self.scan_tag_directive_handle(start_mark)
+        while self.peek() == u' ':
+            self.forward()
+        prefix = self.scan_tag_directive_prefix(start_mark)
+        return (handle, prefix)
+
+    def scan_tag_directive_handle(self, start_mark):
+        # See the specification for details.
+        value = self.scan_tag_handle('directive', start_mark)
+        ch = self.peek()
+        if ch != u' ':
+            raise ScannerError("while scanning a directive", start_mark,
+                    "expected ' ', but found %r" % ch.encode('utf-8'),
+                    self.get_mark())
+        return value
+
+    def scan_tag_directive_prefix(self, start_mark):
+        # See the specification for details.
+        value = self.scan_tag_uri('directive', start_mark)
+        ch = self.peek()
+        if ch not in u'\0 \r\n\x85\u2028\u2029':
+            raise ScannerError("while scanning a directive", start_mark,
+                    "expected ' ', but found %r" % ch.encode('utf-8'),
+                    self.get_mark())
+        return value
+
+    def scan_directive_ignored_line(self, start_mark):
+        # See the specification for details.
+        while self.peek() == u' ':
+            self.forward()
+        if self.peek() == u'#':
+            while self.peek() not in u'\0\r\n\x85\u2028\u2029':
+                self.forward()
+        ch = self.peek()
+        if ch not in u'\0\r\n\x85\u2028\u2029':
+            raise ScannerError("while scanning a directive", start_mark,
+                    "expected a comment or a line break, but found %r"
+                        % ch.encode('utf-8'), self.get_mark())
+        self.scan_line_break()
+
+    def scan_anchor(self, TokenClass):
+        # The specification does not restrict characters for anchors and
+        # aliases. This may lead to problems, for instance, the document:
+        #   [ *alias, value ]
+        # can be interpreted in two ways, as
+        #   [ "value" ]
+        # and
+        #   [ *alias , "value" ]
+        # Therefore we restrict aliases to numbers and ASCII letters.
+        start_mark = self.get_mark()
+        indicator = self.peek()
+        if indicator == '*':
+            name = 'alias'
+        else:
+            name = 'anchor'
+        self.forward()
+        length = 0
+        ch = self.peek(length)
+        while u'0' <= ch <= u'9' or u'A' <= ch <= 'Z' or u'a' <= ch <= 'z'  \
+                or ch in u'-_':
+            length += 1
+            ch = self.peek(length)
+        if not length:
+            raise ScannerError("while scanning an %s" % name, start_mark,
+                    "expected alphabetic or numeric character, but found %r"
+                    % ch.encode('utf-8'), self.get_mark())
+        value = self.prefix(length)
+        self.forward(length)
+        ch = self.peek()
+        if ch not in u'\0 \t\r\n\x85\u2028\u2029?:,]}%@`':
+            raise ScannerError("while scanning an %s" % name, start_mark,
+                    "expected alphabetic or numeric character, but found %r"
+                    % ch.encode('utf-8'), self.get_mark())
+        end_mark = self.get_mark()
+        return TokenClass(value, start_mark, end_mark)
+
+    def scan_tag(self):
+        # See the specification for details.
+        start_mark = self.get_mark()
+        ch = self.peek(1)
+        if ch == u'<':
+            handle = None
+            self.forward(2)
+            suffix = self.scan_tag_uri('tag', start_mark)
+            if self.peek() != u'>':
+                raise ScannerError("while parsing a tag", start_mark,
+                        "expected '>', but found %r" % self.peek().encode('utf-8'),
+                        self.get_mark())
+            self.forward()
+        elif ch in u'\0 \t\r\n\x85\u2028\u2029':
+            handle = None
+            suffix = u'!'
+            self.forward()
+        else:
+            length = 1
+            use_handle = False
+            while ch not in u'\0 \r\n\x85\u2028\u2029':
+                if ch == u'!':
+                    use_handle = True
+                    break
+                length += 1
+                ch = self.peek(length)
+            if use_handle:
+                handle = self.scan_tag_handle('tag', start_mark)
+            else:
+                handle = u'!'
+                self.forward()
+            suffix = self.scan_tag_uri('tag', start_mark)
+        ch = self.peek()
+        if ch not in u'\0 \r\n\x85\u2028\u2029':
+            raise ScannerError("while scanning a tag", start_mark,
+                    "expected ' ', but found %r" % ch.encode('utf-8'),
+                    self.get_mark())
+        value = (handle, suffix)
+        end_mark = self.get_mark()
+        return TagToken(value, start_mark, end_mark)
+
+    def scan_block_scalar(self, style):
+        # See the specification for details.
+
+        if style == '>':
+            folded = True
+        else:
+            folded = False
+
+        chunks = []
+        start_mark = self.get_mark()
+
+        # Scan the header.
+        self.forward()
+        chomping, increment = self.scan_block_scalar_indicators(start_mark)
+        self.scan_block_scalar_ignored_line(start_mark)
+
+        # Determine the indentation level and go to the first non-empty line.
+        min_indent = self.indent+1
+        if min_indent < 1:
+            min_indent = 1
+        if increment is None:
+            breaks, max_indent, end_mark = self.scan_block_scalar_indentation()
+            indent = max(min_indent, max_indent)
+        else:
+            indent = min_indent+increment-1
+            breaks, end_mark = self.scan_block_scalar_breaks(indent)
+        line_break = u''
+
+        # Scan the inner part of the block scalar.
+        while self.column == indent and self.peek() != u'\0':
+            chunks.extend(breaks)
+            leading_non_space = self.peek() not in u' \t'
+            length = 0
+            while self.peek(length) not in u'\0\r\n\x85\u2028\u2029':
+                length += 1
+            chunks.append(self.prefix(length))
+            self.forward(length)
+            line_break = self.scan_line_break()
+            breaks, end_mark = self.scan_block_scalar_breaks(indent)
+            if self.column == indent and self.peek() != u'\0':
+
+                # Unfortunately, folding rules are ambiguous.
+                #
+                # This is the folding according to the specification:
+                
+                if folded and line_break == u'\n'   \
+                        and leading_non_space and self.peek() not in u' \t':
+                    if not breaks:
+                        chunks.append(u' ')
+                else:
+                    chunks.append(line_break)
+                
+                # This is Clark Evans's interpretation (also in the spec
+                # examples):
+                #
+                #if folded and line_break == u'\n':
+                #    if not breaks:
+                #        if self.peek() not in ' \t':
+                #            chunks.append(u' ')
+                #        else:
+                #            chunks.append(line_break)
+                #else:
+                #    chunks.append(line_break)
+            else:
+                break
+
+        # Chomp the tail.
+        if chomping is not False:
+            chunks.append(line_break)
+        if chomping is True:
+            chunks.extend(breaks)
+
+        # We are done.
+        return ScalarToken(u''.join(chunks), False, start_mark, end_mark,
+                style)
+
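+    # (Illustrative: under the folding rules above, the folded scalar
+    #   >
+    #     one
+    #     two
+    #
+    #     three
+    # scans to the value "one two\nthree\n": adjacent lines are joined with
+    # a space, the blank line survives as a newline, and the default (clip)
+    # chomping keeps the final line break.)
+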
+    def scan_block_scalar_indicators(self, start_mark):
+        # See the specification for details.
+        chomping = None
+        increment = None
+        ch = self.peek()
+        if ch in u'+-':
+            if ch == '+':
+                chomping = True
+            else:
+                chomping = False
+            self.forward()
+            ch = self.peek()
+            if ch in u'0123456789':
+                increment = int(ch)
+                if increment == 0:
+                    raise ScannerError("while scanning a block scalar", start_mark,
+                            "expected indentation indicator in the range 1-9, but found 0",
+                            self.get_mark())
+                self.forward()
+        elif ch in u'0123456789':
+            increment = int(ch)
+            if increment == 0:
+                raise ScannerError("while scanning a block scalar", start_mark,
+                        "expected indentation indicator in the range 1-9, but found 0",
+                        self.get_mark())
+            self.forward()
+            ch = self.peek()
+            if ch in u'+-':
+                if ch == '+':
+                    chomping = True
+                else:
+                    chomping = False
+                self.forward()
+        ch = self.peek()
+        if ch not in u'\0 \r\n\x85\u2028\u2029':
+            raise ScannerError("while scanning a block scalar", start_mark,
+                    "expected chomping or indentation indicators, but found %r"
+                        % ch.encode('utf-8'), self.get_mark())
+        return chomping, increment
+
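+    # (Illustrative: the header '|+2' yields chomping=True (keep) and
+    # increment=2; '>-' yields chomping=False (strip) and increment=None;
+    # a bare '|' leaves both at None, i.e. clip chomping with auto-detected
+    # indentation.)
+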
+    def scan_block_scalar_ignored_line(self, start_mark):
+        # See the specification for details.
+        while self.peek() == u' ':
+            self.forward()
+        if self.peek() == u'#':
+            while self.peek() not in u'\0\r\n\x85\u2028\u2029':
+                self.forward()
+        ch = self.peek()
+        if ch not in u'\0\r\n\x85\u2028\u2029':
+            raise ScannerError("while scanning a block scalar", start_mark,
+                    "expected a comment or a line break, but found %r"
+                        % ch.encode('utf-8'), self.get_mark())
+        self.scan_line_break()
+
+    def scan_block_scalar_indentation(self):
+        # See the specification for details.
+        chunks = []
+        max_indent = 0
+        end_mark = self.get_mark()
+        while self.peek() in u' \r\n\x85\u2028\u2029':
+            if self.peek() != u' ':
+                chunks.append(self.scan_line_break())
+                end_mark = self.get_mark()
+            else:
+                self.forward()
+                if self.column > max_indent:
+                    max_indent = self.column
+        return chunks, max_indent, end_mark
+
+    def scan_block_scalar_breaks(self, indent):
+        # See the specification for details.
+        chunks = []
+        end_mark = self.get_mark()
+        while self.column < indent and self.peek() == u' ':
+            self.forward()
+        while self.peek() in u'\r\n\x85\u2028\u2029':
+            chunks.append(self.scan_line_break())
+            end_mark = self.get_mark()
+            while self.column < indent and self.peek() == u' ':
+                self.forward()
+        return chunks, end_mark
+
+    def scan_flow_scalar(self, style):
+        # See the specification for details.
+        # Note that we loosen the indentation rules for quoted scalars. Quoted
+        # scalars don't need to adhere to indentation because " and ' clearly
+        # mark their beginning and end. Therefore we are less
+        # restrictive than the specification requires. We only need to check
+        # that document separators are not included in scalars.
+        if style == '"':
+            double = True
+        else:
+            double = False
+        chunks = []
+        start_mark = self.get_mark()
+        quote = self.peek()
+        self.forward()
+        chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
+        while self.peek() != quote:
+            chunks.extend(self.scan_flow_scalar_spaces(double, start_mark))
+            chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
+        self.forward()
+        end_mark = self.get_mark()
+        return ScalarToken(u''.join(chunks), False, start_mark, end_mark,
+                style)
+
+    ESCAPE_REPLACEMENTS = {
+        u'0':   u'\0',
+        u'a':   u'\x07',
+        u'b':   u'\x08',
+        u't':   u'\x09',
+        u'\t':  u'\x09',
+        u'n':   u'\x0A',
+        u'v':   u'\x0B',
+        u'f':   u'\x0C',
+        u'r':   u'\x0D',
+        u'e':   u'\x1B',
+        u' ':   u'\x20',
+        u'\"':  u'\"',
+        u'\\':  u'\\',
+        u'N':   u'\x85',
+        u'_':   u'\xA0',
+        u'L':   u'\u2028',
+        u'P':   u'\u2029',
+    }
+
+    ESCAPE_CODES = {
+        u'x':   2,
+        u'u':   4,
+        u'U':   8,
+    }
+
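+    # (For instance, the escapes '\x41', '\u0041' and '\U00000041' all
+    # decode to u'A'; the table above gives the number of hexadecimal
+    # digits each escape letter expects.)
+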
+    def scan_flow_scalar_non_spaces(self, double, start_mark):
+        # See the specification for details.
+        chunks = []
+        while True:
+            length = 0
+            while self.peek(length) not in u'\'\"\\\0 \t\r\n\x85\u2028\u2029':
+                length += 1
+            if length:
+                chunks.append(self.prefix(length))
+                self.forward(length)
+            ch = self.peek()
+            if not double and ch == u'\'' and self.peek(1) == u'\'':
+                chunks.append(u'\'')
+                self.forward(2)
+            elif (double and ch == u'\'') or (not double and ch in u'\"\\'):
+                chunks.append(ch)
+                self.forward()
+            elif double and ch == u'\\':
+                self.forward()
+                ch = self.peek()
+                if ch in self.ESCAPE_REPLACEMENTS:
+                    chunks.append(self.ESCAPE_REPLACEMENTS[ch])
+                    self.forward()
+                elif ch in self.ESCAPE_CODES:
+                    length = self.ESCAPE_CODES[ch]
+                    self.forward()
+                    for k in range(length):
+                        if self.peek(k) not in u'0123456789ABCDEFabcdef':
+                            raise ScannerError("while scanning a double-quoted scalar", start_mark,
+                                    "expected escape sequence of %d hexadecimal digits, but found %r" %
+                                        (length, self.peek(k).encode('utf-8')), self.get_mark())
+                    code = int(self.prefix(length), 16)
+                    chunks.append(unichr(code))
+                    self.forward(length)
+                elif ch in u'\r\n\x85\u2028\u2029':
+                    self.scan_line_break()
+                    chunks.extend(self.scan_flow_scalar_breaks(double, start_mark))
+                else:
+                    raise ScannerError("while scanning a double-quoted scalar", start_mark,
+                            "found unknown escape character %r" % ch.encode('utf-8'), self.get_mark())
+            else:
+                return chunks
+
+    def scan_flow_scalar_spaces(self, double, start_mark):
+        # See the specification for details.
+        chunks = []
+        length = 0
+        while self.peek(length) in u' \t':
+            length += 1
+        whitespaces = self.prefix(length)
+        self.forward(length)
+        ch = self.peek()
+        if ch == u'\0':
+            raise ScannerError("while scanning a quoted scalar", start_mark,
+                    "found unexpected end of stream", self.get_mark())
+        elif ch in u'\r\n\x85\u2028\u2029':
+            line_break = self.scan_line_break()
+            breaks = self.scan_flow_scalar_breaks(double, start_mark)
+            if line_break != u'\n':
+                chunks.append(line_break)
+            elif not breaks:
+                chunks.append(u' ')
+            chunks.extend(breaks)
+        else:
+            chunks.append(whitespaces)
+        return chunks
+
+    def scan_flow_scalar_breaks(self, double, start_mark):
+        # See the specification for details.
+        chunks = []
+        while True:
+            # Instead of checking indentation, we check for document
+            # separators.
+            prefix = self.prefix(3)
+            if (prefix == u'---' or prefix == u'...')   \
+                    and self.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
+                raise ScannerError("while scanning a quoted scalar", start_mark,
+                        "found unexpected document separator", self.get_mark())
+            while self.peek() in u' \t':
+                self.forward()
+            if self.peek() in u'\r\n\x85\u2028\u2029':
+                chunks.append(self.scan_line_break())
+            else:
+                return chunks
+
+    def scan_plain(self):
+        # See the specification for details.
+        # We add an additional restriction for the flow context:
+        #   plain scalars in the flow context cannot contain ',', ':', or '?'.
+        # We also keep track of the `allow_simple_key` flag here.
+        # Indentation rules are loosened for the flow context.
+        chunks = []
+        start_mark = self.get_mark()
+        end_mark = start_mark
+        indent = self.indent+1
+        # We allow zero indentation for scalars, but then we need to check for
+        # document separators at the beginning of the line.
+        #if indent == 0:
+        #    indent = 1
+        spaces = []
+        while True:
+            length = 0
+            if self.peek() == u'#':
+                break
+            while True:
+                ch = self.peek(length)
+                if ch in u'\0 \t\r\n\x85\u2028\u2029'   \
+                        or (not self.flow_level and ch == u':' and
+                                self.peek(length+1) in u'\0 \t\r\n\x85\u2028\u2029') \
+                        or (self.flow_level and ch in u',:?[]{}'):
+                    break
+                length += 1
+            # It's not clear what we should do with ':' in the flow context.
+            if (self.flow_level and ch == u':'
+                    and self.peek(length+1) not in u'\0 \t\r\n\x85\u2028\u2029,[]{}'):
+                self.forward(length)
+                raise ScannerError("while scanning a plain scalar", start_mark,
+                    "found unexpected ':'", self.get_mark(),
+                    "Please check http://pyyaml.org/wiki/YAMLColonInFlowContext for details.")
+            if length == 0:
+                break
+            self.allow_simple_key = False
+            chunks.extend(spaces)
+            chunks.append(self.prefix(length))
+            self.forward(length)
+            end_mark = self.get_mark()
+            spaces = self.scan_plain_spaces(indent, start_mark)
+            if not spaces or self.peek() == u'#' \
+                    or (not self.flow_level and self.column < indent):
+                break
+        return ScalarToken(u''.join(chunks), True, start_mark, end_mark)
+
+    def scan_plain_spaces(self, indent, start_mark):
+        # See the specification for details.
+        # The specification is really confusing about tabs in plain scalars.
+        # We just forbid them completely. Do not use tabs in YAML!
+        chunks = []
+        length = 0
+        while self.peek(length) in u' ':
+            length += 1
+        whitespaces = self.prefix(length)
+        self.forward(length)
+        ch = self.peek()
+        if ch in u'\r\n\x85\u2028\u2029':
+            line_break = self.scan_line_break()
+            self.allow_simple_key = True
+            prefix = self.prefix(3)
+            if (prefix == u'---' or prefix == u'...')   \
+                    and self.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
+                return
+            breaks = []
+            while self.peek() in u' \r\n\x85\u2028\u2029':
+                if self.peek() == ' ':
+                    self.forward()
+                else:
+                    breaks.append(self.scan_line_break())
+                    prefix = self.prefix(3)
+                    if (prefix == u'---' or prefix == u'...')   \
+                            and self.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
+                        return
+            if line_break != u'\n':
+                chunks.append(line_break)
+            elif not breaks:
+                chunks.append(u' ')
+            chunks.extend(breaks)
+        elif whitespaces:
+            chunks.append(whitespaces)
+        return chunks
+
+    def scan_tag_handle(self, name, start_mark):
+        # See the specification for details.
+        # For some strange reason, the specification does not allow '_' in
+        # tag handles. I have allowed it anyway.
+        ch = self.peek()
+        if ch != u'!':
+            raise ScannerError("while scanning a %s" % name, start_mark,
+                    "expected '!', but found %r" % ch.encode('utf-8'),
+                    self.get_mark())
+        length = 1
+        ch = self.peek(length)
+        if ch != u' ':
+            while u'0' <= ch <= u'9' or u'A' <= ch <= 'Z' or u'a' <= ch <= 'z'  \
+                    or ch in u'-_':
+                length += 1
+                ch = self.peek(length)
+            if ch != u'!':
+                self.forward(length)
+                raise ScannerError("while scanning a %s" % name, start_mark,
+                        "expected '!', but found %r" % ch.encode('utf-8'),
+                        self.get_mark())
+            length += 1
+        value = self.prefix(length)
+        self.forward(length)
+        return value
+
+    def scan_tag_uri(self, name, start_mark):
+        # See the specification for details.
+        # Note: we do not check if the URI is well-formed.
+        chunks = []
+        length = 0
+        ch = self.peek(length)
+        while u'0' <= ch <= u'9' or u'A' <= ch <= 'Z' or u'a' <= ch <= 'z'  \
+                or ch in u'-;/?:@&=+$,_.!~*\'()[]%':
+            if ch == u'%':
+                chunks.append(self.prefix(length))
+                self.forward(length)
+                length = 0
+                chunks.append(self.scan_uri_escapes(name, start_mark))
+            else:
+                length += 1
+            ch = self.peek(length)
+        if length:
+            chunks.append(self.prefix(length))
+            self.forward(length)
+            length = 0
+        if not chunks:
+            raise ScannerError("while parsing a %s" % name, start_mark,
+                    "expected URI, but found %r" % ch.encode('utf-8'),
+                    self.get_mark())
+        return u''.join(chunks)
+
+    def scan_uri_escapes(self, name, start_mark):
+        # See the specification for details.
+        bytes = []
+        mark = self.get_mark()
+        while self.peek() == u'%':
+            self.forward()
+            for k in range(2):
+                if self.peek(k) not in u'0123456789ABCDEFabcdef':
+                    raise ScannerError("while scanning a %s" % name, start_mark,
+                            "expected URI escape sequence of 2 hexadecimal digits, but found %r" %
+                                (self.peek(k).encode('utf-8')), self.get_mark())
+            bytes.append(chr(int(self.prefix(2), 16)))
+            self.forward(2)
+        try:
+            value = unicode(''.join(bytes), 'utf-8')
+        except UnicodeDecodeError, exc:
+            raise ScannerError("while scanning a %s" % name, start_mark, str(exc), mark)
+        return value
+
+    def scan_line_break(self):
+        # Transforms:
+        #   '\r\n'      :   '\n'
+        #   '\r'        :   '\n'
+        #   '\n'        :   '\n'
+        #   '\x85'      :   '\n'
+        #   '\u2028'    :   '\u2028'
+        #   '\u2029'    :   '\u2029'
+        #   default     :   ''
+        ch = self.peek()
+        if ch in u'\r\n\x85':
+            if self.prefix(2) == u'\r\n':
+                self.forward(2)
+            else:
+                self.forward()
+            return u'\n'
+        elif ch in u'\u2028\u2029':
+            self.forward()
+            return ch
+        return u''
+
+#try:
+#    import psyco
+#    psyco.bind(Scanner)
+#except ImportError:
+#    pass
+

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/80e25b46/tools/bin/ext/yaml/serializer.py
----------------------------------------------------------------------
diff --git a/tools/bin/ext/yaml/serializer.py b/tools/bin/ext/yaml/serializer.py
new file mode 100644
index 0000000..eb70372
--- /dev/null
+++ b/tools/bin/ext/yaml/serializer.py
@@ -0,0 +1,127 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+__all__ = ['Serializer', 'SerializerError']
+
+from error import YAMLError
+from events import *
+from nodes import *
+
+class SerializerError(YAMLError):
+    pass
+
+class Serializer(object):
+
+    ANCHOR_TEMPLATE = u'id%03d'
+
+    def __init__(self, encoding=None,
+            explicit_start=None, explicit_end=None, version=None, tags=None):
+        self.use_encoding = encoding
+        self.use_explicit_start = explicit_start
+        self.use_explicit_end = explicit_end
+        self.use_version = version
+        self.use_tags = tags
+        self.serialized_nodes = {}
+        self.anchors = {}
+        self.last_anchor_id = 0
+        self.closed = None
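+        # The closed flag is three-valued: None means open() has not been
+        # called yet, False means the stream is open, True means closed.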
+
+    def open(self):
+        if self.closed is None:
+            self.emit(StreamStartEvent(encoding=self.use_encoding))
+            self.closed = False
+        elif self.closed:
+            raise SerializerError("serializer is closed")
+        else:
+            raise SerializerError("serializer is already opened")
+
+    def close(self):
+        if self.closed is None:
+            raise SerializerError("serializer is not opened")
+        elif not self.closed:
+            self.emit(StreamEndEvent())
+            self.closed = True
+
+    #def __del__(self):
+    #    self.close()
+
+    def serialize(self, node):
+        if self.closed is None:
+            raise SerializerError("serializer is not opened")
+        elif self.closed:
+            raise SerializerError("serializer is closed")
+        self.emit(DocumentStartEvent(explicit=self.use_explicit_start,
+            version=self.use_version, tags=self.use_tags))
+        self.anchor_node(node)
+        self.serialize_node(node, None, None)
+        self.emit(DocumentEndEvent(explicit=self.use_explicit_end))
+        self.serialized_nodes = {}
+        self.anchors = {}
+        self.last_anchor_id = 0
+
+    def anchor_node(self, node):
+        if node in self.anchors:
+            if self.anchors[node] is None:
+                self.anchors[node] = self.generate_anchor(node)
+        else:
+            self.anchors[node] = None
+            if isinstance(node, SequenceNode):
+                for item in node.value:
+                    self.anchor_node(item)
+            elif isinstance(node, MappingNode):
+                for key, value in node.value:
+                    self.anchor_node(key)
+                    self.anchor_node(value)
+
+    def generate_anchor(self, node):
+        self.last_anchor_id += 1
+        return self.ANCHOR_TEMPLATE % self.last_anchor_id
+
+    def serialize_node(self, node, parent, index):
+        alias = self.anchors[node]
+        if node in self.serialized_nodes:
+            self.emit(AliasEvent(alias))
+        else:
+            self.serialized_nodes[node] = True
+            self.descend_resolver(parent, index)
+            if isinstance(node, ScalarNode):
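+                # The resolver is asked twice: which tag a plain scalar with
+                # this value would get, and which tag a quoted one would get.
+                # The resulting pair of booleans tells the emitter whether
+                # node.tag can be omitted from the output.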
+                detected_tag = self.resolve(ScalarNode, node.value, (True, False))
+                default_tag = self.resolve(ScalarNode, node.value, (False, True))
+                implicit = (node.tag == detected_tag), (node.tag == default_tag)
+                self.emit(ScalarEvent(alias, node.tag, implicit, node.value,
+                    style=node.style))
+            elif isinstance(node, SequenceNode):
+                implicit = (node.tag
+                            == self.resolve(SequenceNode, node.value, True))
+                self.emit(SequenceStartEvent(alias, node.tag, implicit,
+                    flow_style=node.flow_style))
+                index = 0
+                for item in node.value:
+                    self.serialize_node(item, node, index)
+                    index += 1
+                self.emit(SequenceEndEvent())
+            elif isinstance(node, MappingNode):
+                implicit = (node.tag
+                            == self.resolve(MappingNode, node.value, True))
+                self.emit(MappingStartEvent(alias, node.tag, implicit,
+                    flow_style=node.flow_style))
+                for key, value in node.value:
+                    self.serialize_node(key, node, None)
+                    self.serialize_node(value, node, key)
+                self.emit(MappingEndEvent())
+            self.ascend_resolver()
+

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/80e25b46/tools/bin/ext/yaml/tokens.py
----------------------------------------------------------------------
diff --git a/tools/bin/ext/yaml/tokens.py b/tools/bin/ext/yaml/tokens.py
new file mode 100644
index 0000000..41ee0fb
--- /dev/null
+++ b/tools/bin/ext/yaml/tokens.py
@@ -0,0 +1,120 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class Token(object):
+    def __init__(self, start_mark, end_mark):
+        self.start_mark = start_mark
+        self.end_mark = end_mark
+    def __repr__(self):
+        attributes = [key for key in self.__dict__
+                if not key.endswith('_mark')]
+        attributes.sort()
+        arguments = ', '.join(['%s=%r' % (key, getattr(self, key))
+                for key in attributes])
+        return '%s(%s)' % (self.__class__.__name__, arguments)
+
+#class BOMToken(Token):
+#    id = '<byte order mark>'
+
+class DirectiveToken(Token):
+    id = '<directive>'
+    def __init__(self, name, value, start_mark, end_mark):
+        self.name = name
+        self.value = value
+        self.start_mark = start_mark
+        self.end_mark = end_mark
+
+class DocumentStartToken(Token):
+    id = '<document start>'
+
+class DocumentEndToken(Token):
+    id = '<document end>'
+
+class StreamStartToken(Token):
+    id = '<stream start>'
+    def __init__(self, start_mark=None, end_mark=None,
+            encoding=None):
+        self.start_mark = start_mark
+        self.end_mark = end_mark
+        self.encoding = encoding
+
+class StreamEndToken(Token):
+    id = '<stream end>'
+
+class BlockSequenceStartToken(Token):
+    id = '<block sequence start>'
+
+class BlockMappingStartToken(Token):
+    id = '<block mapping start>'
+
+class BlockEndToken(Token):
+    id = '<block end>'
+
+class FlowSequenceStartToken(Token):
+    id = '['
+
+class FlowMappingStartToken(Token):
+    id = '{'
+
+class FlowSequenceEndToken(Token):
+    id = ']'
+
+class FlowMappingEndToken(Token):
+    id = '}'
+
+class KeyToken(Token):
+    id = '?'
+
+class ValueToken(Token):
+    id = ':'
+
+class BlockEntryToken(Token):
+    id = '-'
+
+class FlowEntryToken(Token):
+    id = ','
+
+class AliasToken(Token):
+    id = '<alias>'
+    def __init__(self, value, start_mark, end_mark):
+        self.value = value
+        self.start_mark = start_mark
+        self.end_mark = end_mark
+
+class AnchorToken(Token):
+    id = '<anchor>'
+    def __init__(self, value, start_mark, end_mark):
+        self.value = value
+        self.start_mark = start_mark
+        self.end_mark = end_mark
+
+class TagToken(Token):
+    id = '<tag>'
+    def __init__(self, value, start_mark, end_mark):
+        self.value = value
+        self.start_mark = start_mark
+        self.end_mark = end_mark
+
+class ScalarToken(Token):
+    id = '<scalar>'
+    def __init__(self, value, plain, start_mark, end_mark, style=None):
+        self.value = value
+        self.plain = plain
+        self.start_mark = start_mark
+        self.end_mark = end_mark
+        self.style = style
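+
+# For example (an illustration): __repr__ skips the *_mark attributes and
+# sorts the rest, so ScalarToken(u'a', True, None, None) prints as
+#   ScalarToken(plain=True, style=None, value=u'a')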
+

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/80e25b46/tools/bin/pythonSrc/PSI-0.3b2_gp/DESIGN
----------------------------------------------------------------------
diff --git a/tools/bin/pythonSrc/PSI-0.3b2_gp/DESIGN b/tools/bin/pythonSrc/PSI-0.3b2_gp/DESIGN
new file mode 100644
index 0000000..7ee2628
--- /dev/null
+++ b/tools/bin/pythonSrc/PSI-0.3b2_gp/DESIGN
@@ -0,0 +1,95 @@
+Design of PSI
+=============
+
+
+The C code of psi is split in two: (i) the Python modules and classes
+in src/ and (ii) the architecture- or system-dependent implementations
+that gather the required information in src/arch/.  These two parts
+communicate all the required information to each other using
+structures, i.e. each class makes a call to an arch_*() function that
+returns a new structure with all the required information for that
+class.
+
+The reason for doing this is to shield the system implementations
+from any Python housekeeping like reference counting.  It is hoped
+that this greatly simplifies creating implementations (porting) and
+reduces the risk of hidden memory leaks etc.  There is one exception
+to this: error handling.  Error handling is done exactly as in Python,
+i.e. you set an exception using PyErr_*(PyExc_*, ...) and then return
+NULL or -1 as appropriate.  Therefore whenever a function returns NULL
+or -1 you can be assured an exception is set and you need to bail out
+too.
+
+An implication of using structures to communicate between different
+parts of the application is that the system implementations will have
+to allocate some things like strings etc.  For this reason there are
+utility functions that should always be used for memory management
+purposes: psi_malloc(), psi_calloc(), psi_realloc() and psi_free(),
+all defined in psi.h.  There are also more convenience functions
+available in psi.h; you should definitely read this file.
+
+Lastly the contents of the structures are important.  They are defined
+in the relevant header files and should have types whose size can be
+determined at compilation time to ensure that the classes and modules
+will be able to convert them to the appropriate Python types (and if
+not the compiler should complain).
+
+
+Exceptions
+----------
+
+At Process() initialisation time only two exceptions should be raised:
+(i) NoSuchProcessError if the process does not exist, (ii) OSError if
+something went wrong.  If a psi_*() function returns a negative number
+an OSError has been raised and you can use the errno attribute of the
+exception if present to fine-tune behaviour based on the context.  Some
+functions might try to help you with this fine-tuning by returning a
+specific negative value (other than -1) which indicates a certain
+class of errors (cf. psi_read_file() and psi_readlink()).
+
+XXX Flesh this out a bit and integrate it better where it belongs
+    instead of just being a random paragraph.
+
+
+Modules, APIs and Platforms
+---------------------------
+
+PSI supports many platforms and has several modules.  While we try to
+keep all APIs of all modules stable and fully implemented on all
+platforms this is not always possible.  But each module must compile
+on all platforms; this is easier than it sounds: chances are you will
+manage even if you don't have each platform available.  Since the
+platform-dependent implementation is always done via some psi_arch_*()
+functions (or sometimes just arch_*()) it is quite trivial to write
+stubs that will raise a NotImplementedError for all platforms.
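+
+From Python such a stub simply surfaces as an exception, e.g. (a
+sketch; ``foo`` stands in for any feature name)::
+
+  import psi
+  try:
+      info = psi.foo()
+  except NotImplementedError:
+      info = None    # the feature is only stubbed on this platform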
+
+This technique also simplifies the source declaration in setup.py.  By
+having each feature in <plat>_<feature>.c files
+(e.g. ``linux_process.c``) the module declaration in setup.py can just
+do ``'src/arch/%s_foo.c' % PLATFORM`` for the "foo" feature.  If two
+platforms can share an implementation for "foo" it is then easiest for
+the <arch>_foo.c files to be skeletons using functions from the common
+implementation, e.g.::
+
+  foo_sources = ['src/util.c',
+                 'src/foomodule.c',
+                 'src/arch/%s_foo.c' % PLATFORM]
+  if PLATFORM in ['linux', 'darwin']:
+      foo_sources.append('src/arch/shared_foo_impl.c')
+
+
+C Coding Standard
+-----------------
+
+* PEP7
+
+* Functions follow the Python convention of returning -1 or NULL in
+  case of an error; when an error occurs a Python exception is set.
+  Testing if an `int' function was successful is best done as `if
+  (some_function() < 0)' since some functions might add specific
+  meanings to errors by returning a value smaller than -1
+  (e.g. psi_read_file()).
+
+* Always include <Python.h> first.  Even if nothing from Python is used,
+  it defines _POSIX_C_SOURCE and _XOPEN_SOURCE for us which are
+  required for POSIX compliance.

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/80e25b46/tools/bin/pythonSrc/PSI-0.3b2_gp/GREENPLUM_README
----------------------------------------------------------------------
diff --git a/tools/bin/pythonSrc/PSI-0.3b2_gp/GREENPLUM_README b/tools/bin/pythonSrc/PSI-0.3b2_gp/GREENPLUM_README
new file mode 100644
index 0000000..2fe4880
--- /dev/null
+++ b/tools/bin/pythonSrc/PSI-0.3b2_gp/GREENPLUM_README
@@ -0,0 +1 @@
+patched with this fix: http://bitbucket.org/chrismiles/psi/changeset/bf3e487a5107/

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/80e25b46/tools/bin/pythonSrc/PSI-0.3b2_gp/LICENSE
----------------------------------------------------------------------
diff --git a/tools/bin/pythonSrc/PSI-0.3b2_gp/LICENSE b/tools/bin/pythonSrc/PSI-0.3b2_gp/LICENSE
new file mode 100644
index 0000000..6ffa900
--- /dev/null
+++ b/tools/bin/pythonSrc/PSI-0.3b2_gp/LICENSE
@@ -0,0 +1,26 @@
+The MIT License
+
+Copyright (C) 2007 Chris Miles
+
+Copyright (C) 2008-2009 Floris Bruynooghe
+
+Copyright (C) 2008-2009 Abilisoft Ltd.
+
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/80e25b46/tools/bin/pythonSrc/PSI-0.3b2_gp/MANIFEST
----------------------------------------------------------------------
diff --git a/tools/bin/pythonSrc/PSI-0.3b2_gp/MANIFEST b/tools/bin/pythonSrc/PSI-0.3b2_gp/MANIFEST
new file mode 100644
index 0000000..32043f0
--- /dev/null
+++ b/tools/bin/pythonSrc/PSI-0.3b2_gp/MANIFEST
@@ -0,0 +1,61 @@
+DESIGN
+LICENSE
+MANIFEST
+README
+TODO
+setup.py
+examples/process_details.py
+include/arch.h
+include/linux_utils.h
+include/mount.h
+include/posix_mount.h
+include/posix_utils.h
+include/process.h
+include/procfs_utils.h
+include/psi.h
+include/psifuncs.h
+misc/mem_test.py
+misc/mktests.sh
+misc/valgrind-python.supp
+psi/__init__.py
+psi/_version.py
+src/_psimodule.c
+src/arch.c
+src/archmodule.c
+src/mount.c
+src/mountmodule.c
+src/process.c
+src/processmodule.c
+src/processtable.c
+src/timespec.c
+src/util.c
+src/arch/aix_mount.c
+src/arch/aix_process.c
+src/arch/aix_psi.c
+src/arch/darwin_mount.c
+src/arch/darwin_process.c
+src/arch/darwin_processtable.c
+src/arch/darwin_psi.c
+src/arch/getloadavg.c
+src/arch/linux_mount.c
+src/arch/linux_process.c
+src/arch/linux_psi.c
+src/arch/linux_utils.c
+src/arch/posix_arch.c
+src/arch/posix_mount.c
+src/arch/posix_utils.c
+src/arch/procfs_processtable.c
+src/arch/procfs_utils.c
+src/arch/sargs64.c
+src/arch/sunos_mount.c
+src/arch/sunos_process.c
+src/arch/sunos_psi.c
+tests/_psi_test.py
+tests/aixapp.c
+tests/app.c
+tests/apphelper.py
+tests/arch_test.py
+tests/mount_test.py
+tests/process_test.py
+tests/processtable_test.py
+tests/timespec_test.py

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/80e25b46/tools/bin/pythonSrc/PSI-0.3b2_gp/PKG-INFO
----------------------------------------------------------------------
diff --git a/tools/bin/pythonSrc/PSI-0.3b2_gp/PKG-INFO b/tools/bin/pythonSrc/PSI-0.3b2_gp/PKG-INFO
new file mode 100644
index 0000000..f734d64
--- /dev/null
+++ b/tools/bin/pythonSrc/PSI-0.3b2_gp/PKG-INFO
@@ -0,0 +1,129 @@
+Metadata-Version: 1.0
+Name: PSI
+Version: 0.3b2
+Summary: Python System Information
+Home-page: http://bitbucket.org/chrismiles/psi
+Author: Chris Miles, Floris Bruynooghe, Erick Tryzelaar
+Author-email: psi-discuss@googlegroups.com
+License: MIT
+Download-URL: http://pypi.python.org/pypi/PSI/
+Description: 
+        -------------------------
+        Python System Information
+        -------------------------
+        
+        ``psi`` is a Python module providing direct access to real-time system
+        and process information.  It is made up of several sub-modules.
+        
+        The ``arch`` module gives some information about the system such as
+        the system name and version, the machine architecture etc.  It has a
+        class representing each system and a factory function that will return
+        an instance of the class which ``psi`` is running on currently.
+        
+        The experimental ``mount`` module gives information about the various
+        mounted filesystems on the system.  It has a class representing local
+        or remote filesystems.
+        
+        The ``process`` module provides an interface to information about
+        processes currently running on the system.  Each process is
+        represented as an instance of the ``Process`` class and additionally
+        there is a ``ProcessTable`` class which is a dictionary of all running
+        processes.  To know exactly what attributes are available and what
+        they mean you should look at the docstrings and examples in the
+        ``README`` file and ``examples/`` directory, but it is important
+        to note that all the information is collected at instantiation
+        time.  So the
+        contents of ``ProcessTable`` and ``Process`` instances are really
+        snapshots and will still contain all information even after the actual
+        process has gone.
+        
+        Lastly there are some general functions available directly under the
+        ``psi`` namespace such as ``loadavg()``, ``getzoneid()`` etc.  Once
+        more see the docstrings for detailed information.
+        
+        Some information may not be available on all platforms; rather than
+        trying to emulate this information, these parts of the API just don't
+        exist on those platforms.  Examples of these are:
+        ``psi.process.Process.pcpu`` which is not available on Linux,
+        ``psi.getzoneid()`` which is only available on SunOS 10 and above etc.
+        
+        
+        Supported Platforms
+        ===================
+        
+        Python: 2.2 and above, including 3.x.
+        
+        Linux: all 2.4 and 2.6 kernels.
+        
+        SunOS: Solaris 8 and above, including OpenSolaris (SunOS 11).
+        
+        AIX: 5.3
+        
+        Darwin: 10.3 and above.
+        
+        
+        Documentation
+        =============
+        
+        Care is taken to provide complete and accurate docstrings, so
+        Python's ``pydoc`` tool and the interactive prompt should get you on
+        your way.
+        
+        We also have a wiki (http://bitbucket.org/chrismiles/psi/wiki/Home)
+        and a mailing list (http://groups.google.com/group/psi-discuss
+        psi-discuss@googlegroups.com).  Don't hesitate to ask questions or
+        give feedback.
+        
+        
+        Bugs
+        ====
+        
+        Please use our issue tracker:
+        http://bitbucket.org/chrismiles/psi/issues
+        
+        
+        Extra setup.py features
+        =======================
+        
+        New ``build_ext`` option: ``--devel``.  This uses ``-Werror``,
+        enables many more warnings and disables optimisation.
+        
+        Using ``--undef PYMALLOC`` or ``-U PYMALLOC`` to ``build_ext`` will
+        use libc's memory heap for allocation instead of Python's.
+        
+        
+        The ``test`` command will run the testsuite.  Some tests will only be
+        run when running the test suite as root.  Currently these are the
+        tests that try to run a simple test application under specific
+        schedulers and priorities to assert psi detects these process
+        attributes correctly.
+        
+        
+        The ``valgrind`` command runs the testsuite under the valgrind
+        memory checker.  For this you need to have a specially compiled
+        python::
+        
+          ./configure --with-pydebug --without-pymalloc --prefix=/opt/pydebug
+          make
+          make install
+        
+        
+        The ``tags`` command will build an emacs TAGS file using ``grind``
+        (the executable provided by the Python grin_ package).
+        
+        .. _grin: http://pypi.python.org/pypi/grin
+        
+Platform: UNKNOWN
+Classifier: Development Status :: 4 - Beta
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Intended Audience :: Developers
+Classifier: Operating System :: MacOS :: MacOS X
+Classifier: Operating System :: POSIX :: AIX
+Classifier: Operating System :: POSIX :: Linux
+Classifier: Operating System :: POSIX :: SunOS/Solaris
+Classifier: Programming Language :: C
+Classifier: Programming Language :: Python
+Classifier: Programming Language :: Python :: 2
+Classifier: Programming Language :: Python :: 3
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Classifier: Topic :: System :: Operating System Kernels
+Classifier: Topic :: System :: Systems Administration

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/80e25b46/tools/bin/pythonSrc/PSI-0.3b2_gp/README
----------------------------------------------------------------------
diff --git a/tools/bin/pythonSrc/PSI-0.3b2_gp/README b/tools/bin/pythonSrc/PSI-0.3b2_gp/README
new file mode 100644
index 0000000..139229e
--- /dev/null
+++ b/tools/bin/pythonSrc/PSI-0.3b2_gp/README
@@ -0,0 +1,281 @@
+===============================
+PSI - Python System Information
+===============================
+
+Overview
+========
+
+PSI is a Python module providing direct access to real-time system and
+process information.  It is made up of several sub-modules.
+
+The ``arch`` module gives some information about the system such as the
+system name and version, the machine architecture etc.  It has a class
+representing each system and a factory function that will return an
+instance of the class which psi is running on currently.
+
+The ``process`` module provides an interface to information about
+processes currently running on the system.  Each process is
+represented as an instance of the Process class and additionally there
+is a ProcessTable class which is a dictionary of all running
+processes.  To know exactly what attributes are available and what
+they mean you should look at the docstrings and examples below, but it
+is important to note that all the information is collected at
+instantiation time.  So the contents of ProcessTable and Process
+instances are really snapshots and will still contain all information
+even after the actual process has gone.
+
+Lastly there are some general functions available directly under the
+``psi`` namespace such as ``loadavg()``, ``getzoneid()`` etc.  Once
+more see the docstrings for detailed information.
+
+Some information may not be available on all platforms; rather than
+trying to emulate this information, these parts of the API just don't
+exist on those platforms.  Examples of these are:
+``psi.process.Process.pcpu`` which is not available on Linux,
+``psi.getzoneid()`` which is only available on SunOS 10 and above etc.
+If not all information can be found, some attribute descriptors of
+objects might raise subclasses of AttributeError, allowing you to use
+generic ``getattr()`` semantics as well as to detect more specifically
+why an attribute is not available (insufficient privileges, not
+implemented, ...).
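+
+For example (a sketch; ``psi.InsufficientPrivsError``, shown in the
+Examples section below, is one such subclass)::
+
+  import psi.process
+
+  p = psi.process.Process(1)
+  try:
+      env = p.env
+  except AttributeError:      # also catches InsufficientPrivsError
+      env = None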
+
+
+Install
+=======
+
+You need to check if it's supported on your platform; check the
+docstring of setup.py for the exact supported platforms.
+
+You will also need a working C compiler and the python development
+files.  If a system is not supported yet the build will fail.  After
+building it is best to run the test suite; on some platforms not all
+tests will pass yet, and you would rather know of these problems before
+starting to use PSI.
+
+So to fully install PSI from source::
+
+  $ python setup.py build
+  $ python setup.py test [--all]
+  $ python setup.py install [<your options>]
+
+See the Python documentation on installing Python modules for more
+control: http://docs.python.org/install/index.html
+
+
+Here is a rough list of packages you will need for some popular GNU/Linux
+distributions; this should give you an idea of what is needed.
+
+Debian/Ubuntu:
+
+ - python
+ - python-dev
+ - gcc
+
+Redhat/CentOS:
+
+ - python
+ - python-devel
+ - gcc
+
+
+
+Limitations
+===========
+
+Solaris
+-------
+
+If the module is compiled as 32-bit (which is what will usually happen
+since that tends to be how python is compiled) it will use the ILP32
+model and will not be able to read the address space of 64-bit
+processes as they use the LP64 model.  This means that the full
+argument list and the environment dictionary will not be available.
+The partial argument list can still be retrieved from the
+``psi.process.Process.command`` attribute.
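+
+For example (a sketch)::
+
+  import psi.process
+
+  p = psi.process.Process(1)
+  print p.command     # the partial argument list is still available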
+
+
+Unit Tests
+==========
+
+To run the unit tests::
+
+  $ python setup.py test [--all]
+
+The --all option will run tests that require superuser privileges;
+these are needed to run some test applications under specific
+schedulers and priorities and to check that psi detects these correctly.
+To acquire superuser privileges sudo is used when available, falling
+back to "su -c".
+
+If any tests fail, please copy & paste the output and send operating
+system version, python version, python executable format (32/64-bit)
+and any other applicable details to the mailing list
+(psi-discuss@googlegroups.com).
+
+
+Examples
+========
+
+Examples are the best documentation.  :-)
+
+::
+
+  Python 2.5.2 (r252:60911, Oct  5 2008, 19:24:49)
+  [GCC 4.3.2] on linux2
+  Type "help", "copyright", "credits" or "license" for more information.
+  >>> import psi
+  >>>
+  >>> a = psi.arch.arch_type()
+  >>> a
+  psi.arch.ArchLinux()
+  >>> isinstance(a, psi.arch.ArchLinux)
+  True
+  >>> isinstance(a, psi.arch.ArchSunOS)
+  False
+  >>> a.sysname
+  'Linux'
+  >>> a.nodename
+  'signy'
+  >>> a.release
+  '2.6.27-9-generic'
+  >>> a.release_info
+  (2L, 6L, 26L)
+  >>> a.version
+  '#1 SMP Thu Nov 20 21:57:00 UTC 2008'
+  >>> a.machine
+  'i686'
+  >>>
+  >>> psi.loadavg()
+  (0.059999999999999998, 0.13, 0.13)
+  >>>
+  >>> import os
+  >>> mypid = os.getpid()
+  >>> mypid
+  21374
+  >>> p = psi.process.Process(mypid)
+  >>> p.args
+  ('python',)
+  >>> p.exe
+  '/usr/bin/python2.5'
+  >>> p.uid
+  1000
+  >>> import pwd
+  >>> pwd.getpwuid(p.uid)
+  pwd.struct_passwd(pw_name='flub', pw_passwd='x', pw_uid=1000,
+  pw_gid=1000, pw_gecos='Floris Bruynooghe,,,', pw_dir='/home/flub',
+  pw_shell='/bin/bash')
+  >>> p.start_time
+  datetime.datetime(2009, 5, 11, 20, 4, 31, 709993)
+  >>> help(psi.process.Process.start_time)
+  Help on getset descriptor psi.process.Process.start_time:
+
+  start_time
+    Start time of process as datetime.datetime object
+
+    Use .strftime('%s') to get seconds since epoch
+  >>> p.ppid
+  21304L
+  >>> parent = psi.process.Process(pid=p.ppid)
+  >>> parent.args
+  ('bash',)
+  >>> p.rss
+  2293L
+  >>>
+  >>> pt = psi.process.ProcessTable()
+  >>> len(pt)
+  145
+  >>> pt.keys()
+  [1L, 2L, 3L, 4L, 5L, 6L, 7L, 6152L, 5660L, 6687L, 4648L, 5674L,
+  5639L, 6834L, 46L, 48L, 50L, 51L, 5684L, 1081L, 2621L, 5184L, 6276L,
+  2627L, 6217L, 6220L, 6221L, 4877L, 6224L, 6227L, 6229L, 6235L,
+  6757L, 5729L, 6244L, 2149L, 6246L, 2152L, 2153L, 6763L, 6764L,
+  6254L, 6248L, 6259L, 5749L, 2166L, 6249L, 2169L, 5242L, 2683L,
+  6781L, 6273L, 6274L, 4740L, 6278L, 6279L, 5772L, 5776L, 6290L,
+  5779L, 5268L, 6296L, 6809L, 156L, 157L, 158L, 6341L, 6307L, 2212L,
+  6257L, 20654L, 6856L, 4786L, 4788L, 5814L, 5817L, 6263L, 5821L,
+  6261L, 6336L, 4811L, 118L, 200L, 1739L, 5837L, 1742L, 1743L, 6349L,
+  20693L, 6265L, 5852L, 122L, 4833L, 4834L, 1253L, 2793L, 1258L,
+  6382L, 21374L, 6393L, 5885L, 1278L, 5376L, 6401L, 5675L, 1290L,
+  1293L, 6445L, 5397L, 6262L, 5913L, 4385L, 4386L, 4392L, 4393L,
+  4394L, 1074L, 6449L, 5426L, 6456L, 6458L, 5951L, 19779L, 5963L,
+  5964L, 21327L, 21304L, 5466L, 4955L, 4444L, 2404L, 5501L, 6526L,
+  4992L, 4993L, 6032L, 6572L, 6442L, 5573L, 5576L, 2794L, 17357L,
+  4054L, 6397L, 6136L]
+  >>> q = pt[6856]
+  >>> q.args
+  ('emacs22-gtk',)
+  >>> q.env
+  {'GNOME_DESKTOP_SESSION_ID': 'this-is-deprecated', 'LOGNAME':
+  'flub', 'USER': 'flub', 'HOME': '/home/flub', 'PATH':
+  '/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games',
+  'DISPLAY': ':0.0', 'SSH_AGENT_PID': '6217', 'LANG': 'en_GB.UTF-8',
+  'SHELL': '/bin/bash', 'XDG_SESSION_COOKIE':
+  '26100f517aa94dec0dc7b4aa494948ea-1231927234.615881-1120709651',
+  'SESSION_MANAGER': 'local/signy:/tmp/.ICE-unix/6152',
+  'XDG_DATA_DIRS': '/usr/local/share/:/usr/share/:/usr/share/gdm/',
+  'WINDOWPATH': '7', 'GPG_AGENT_INFO':
+  '/tmp/seahorse-SWS6qW/S.gpg-agent:6235:1', 'USERNAME': 'flub',
+  'GDM_XSERVER_LOCATION': 'local', 'SSH_AUTH_SOCK':
+  '/tmp/keyring-AStyaD/ssh', 'DESKTOP_SESSION': 'gnome', 'GDMSESSION':
+  'gnome', 'DBUS_SESSION_BUS_ADDRESS':
+  'unix:abstract=/tmp/dbus-HFM6sYANYS,guid=916b5b81c03bda327eaaa293496db7c5',
+  'ORBIT_SOCKETDIR': '/tmp/orbit-flub', 'XAUTHORITY':
+  '/home/flub/.Xauthority', 'GNOME_KEYRING_SOCKET':
+  '/tmp/keyring-AStyaD/socket', 'GNOME_KEYRING_PID': '6246\n',
+  'GDM_LANG': 'en_GB.UTF-8', 'PWD': '/home/flub', 'GTK_RC_FILES':
+  '/etc/gtk/gtkrc:/home/flub/.gtkrc-1.2-gnome2'}
+  >>>
+  >>> r = []
+  >>> for pp in pt.values():
+  ...     if pp.args and 'evolution' in pp.args[0]:
+  ...         r.append(pp)
+  ...
+  >>> r
+  [psi.process.Process(pid=6336), psi.process.Process(pid=6572),
+  psi.process.Process(pid=6397)]
+  >>> for pp in r:
+  ...     print ' '.join(pp.args)
+  ...
+  /usr/lib/evolution/2.24/evolution-exchange-storage --oaf-activate-iid=OAFIID:GNOME_Evolution_Exchange_Connector_CalFactory:1.2 --oaf-ior-fd=22
+  evolution
+  /usr/lib/evolution/evolution-data-server-2.24 --oaf-activate-iid=OAFIID:GNOME_Evolution_DataServer_CalFactory:1.2 --oaf-ior-fd=23
+  >>>
+  >>> init = psi.process.Process(1)
+  >>> init.env
+  Traceback (most recent call last):
+    File "<stdin>", line 1, in <module>
+  psi.InsufficientPrivsError: Insufficient privileges for Process.env
+  >>>
+
+Some sample scripts are also available in the examples/ subdirectory.
+
+
+Development
+===========
+
+You should definitely read the DESIGN file, it will explain the ideas
+behind how the code is structured and how you should extend it.  Also
+very useful is the TODO file and the wiki
+(http://bitbucket.org/chrismiles/psi/wiki/Development).
+
+The setup.py has some extra features that help with development,
+these are explained in its docstring.
+
+The PSI source code is hosted at bitbucket in a Mercurial repository:
+http://bitbucket.org/chrismiles/psi/
+
+
+Porting
+=======
+
+If PSI does not yet work on your architecture/system then it needs to
+be ported; the design is aimed at making this easy for you.  There is
+no separate porting guide at the moment so you will have to read the
+DESIGN file for details of what to do.
+
+
+Help
+====
+
+There is a mailing list at psi-discuss@googlegroups.com where you can
+post questions, patches, hints, bugs, etc.  Your feedback is welcome!


