Author: mahadev
Date: Fri Apr 24 18:26:08 2009
New Revision: 768390
URL: http://svn.apache.org/viewvc?rev=768390&view=rev
Log:
ZOOKEEPER-29. Flexible quorums (flavio via mahadev)
Added:
hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/flexible/
hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/flexible/QuorumHierarchical.java
hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/flexible/QuorumMaj.java
hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/flexible/QuorumVerifier.java
hadoop/zookeeper/trunk/src/java/test/org/apache/zookeeper/test/HierarchicalQuorumTest.java
Modified:
hadoop/zookeeper/trunk/CHANGES.txt
hadoop/zookeeper/trunk/docs/index.html
hadoop/zookeeper/trunk/docs/javaExample.html
hadoop/zookeeper/trunk/docs/linkmap.html
hadoop/zookeeper/trunk/docs/recipes.html
hadoop/zookeeper/trunk/docs/releasenotes.html
hadoop/zookeeper/trunk/docs/zookeeperAdmin.html
hadoop/zookeeper/trunk/docs/zookeeperAdmin.pdf
hadoop/zookeeper/trunk/docs/zookeeperInternals.html
hadoop/zookeeper/trunk/docs/zookeeperInternals.pdf
hadoop/zookeeper/trunk/docs/zookeeperJMX.html
hadoop/zookeeper/trunk/docs/zookeeperOver.html
hadoop/zookeeper/trunk/docs/zookeeperProgrammers.html
hadoop/zookeeper/trunk/docs/zookeeperQuotas.html
hadoop/zookeeper/trunk/docs/zookeeperStarted.html
hadoop/zookeeper/trunk/docs/zookeeperTutorial.html
hadoop/zookeeper/trunk/src/docs/src/documentation/content/xdocs/zookeeperAdmin.xml
hadoop/zookeeper/trunk/src/docs/src/documentation/content/xdocs/zookeeperInternals.xml
hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/AckRequestProcessor.java
hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/AuthFastLeaderElection.java
hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/FastLeaderElection.java
hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/Follower.java
hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/FollowerHandler.java
hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/Leader.java
hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/QuorumPeer.java
hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/QuorumPeerConfig.java
hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/QuorumPeerMain.java
Modified: hadoop/zookeeper/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/zookeeper/trunk/CHANGES.txt?rev=768390&r1=768389&r2=768390&view=diff
==============================================================================
--- hadoop/zookeeper/trunk/CHANGES.txt (original)
+++ hadoop/zookeeper/trunk/CHANGES.txt Fri Apr 24 18:26:08 2009
@@ -112,6 +112,7 @@
ZOOKEEPER-78. added a high level protocol/feature - for easy Leader
Election or exclusive Write Lock creation (mahadev via phunt)
+ ZOOKEEPER-29. Flexible quorums (flavio via mahadev)
Release 3.1.0 - 2009-02-06
Modified: hadoop/zookeeper/trunk/docs/index.html
URL: http://svn.apache.org/viewvc/hadoop/zookeeper/trunk/docs/index.html?rev=768390&r1=768389&r2=768390&view=diff
==============================================================================
--- hadoop/zookeeper/trunk/docs/index.html (original)
+++ hadoop/zookeeper/trunk/docs/index.html Fri Apr 24 18:26:08 2009
@@ -67,7 +67,7 @@
<a class="unselected" href="http://wiki.apache.org/hadoop/ZooKeeper">Wiki</a>
</li>
<li class="current">
-<a class="selected" href="index.html">ZooKeeper 3.1 Documentation</a>
+<a class="selected" href="index.html">ZooKeeper 3.2 Documentation</a>
</li>
</ul>
<!--+
Modified: hadoop/zookeeper/trunk/docs/javaExample.html
URL: http://svn.apache.org/viewvc/hadoop/zookeeper/trunk/docs/javaExample.html?rev=768390&r1=768389&r2=768390&view=diff
==============================================================================
--- hadoop/zookeeper/trunk/docs/javaExample.html (original)
+++ hadoop/zookeeper/trunk/docs/javaExample.html Fri Apr 24 18:26:08 2009
@@ -67,7 +67,7 @@
<a class="unselected" href="http://wiki.apache.org/hadoop/ZooKeeper">Wiki</a>
</li>
<li class="current">
-<a class="selected" href="index.html">ZooKeeper 3.1 Documentation</a>
+<a class="selected" href="index.html">ZooKeeper 3.2 Documentation</a>
</li>
</ul>
<!--+
Modified: hadoop/zookeeper/trunk/docs/linkmap.html
URL: http://svn.apache.org/viewvc/hadoop/zookeeper/trunk/docs/linkmap.html?rev=768390&r1=768389&r2=768390&view=diff
==============================================================================
--- hadoop/zookeeper/trunk/docs/linkmap.html (original)
+++ hadoop/zookeeper/trunk/docs/linkmap.html Fri Apr 24 18:26:08 2009
@@ -67,7 +67,7 @@
<a class="unselected" href="http://wiki.apache.org/hadoop/ZooKeeper">Wiki</a>
</li>
<li class="current">
-<a class="selected" href="index.html">ZooKeeper 3.1 Documentation</a>
+<a class="selected" href="index.html">ZooKeeper 3.2 Documentation</a>
</li>
</ul>
<!--+
Modified: hadoop/zookeeper/trunk/docs/recipes.html
URL: http://svn.apache.org/viewvc/hadoop/zookeeper/trunk/docs/recipes.html?rev=768390&r1=768389&r2=768390&view=diff
==============================================================================
--- hadoop/zookeeper/trunk/docs/recipes.html (original)
+++ hadoop/zookeeper/trunk/docs/recipes.html Fri Apr 24 18:26:08 2009
@@ -67,7 +67,7 @@
<a class="unselected" href="http://wiki.apache.org/hadoop/ZooKeeper">Wiki</a>
</li>
<li class="current">
-<a class="selected" href="index.html">ZooKeeper 3.1 Documentation</a>
+<a class="selected" href="index.html">ZooKeeper 3.2 Documentation</a>
</li>
</ul>
<!--+
Modified: hadoop/zookeeper/trunk/docs/releasenotes.html
URL: http://svn.apache.org/viewvc/hadoop/zookeeper/trunk/docs/releasenotes.html?rev=768390&r1=768389&r2=768390&view=diff
==============================================================================
--- hadoop/zookeeper/trunk/docs/releasenotes.html (original)
+++ hadoop/zookeeper/trunk/docs/releasenotes.html Fri Apr 24 18:26:08 2009
@@ -67,7 +67,7 @@
<a class="unselected" href="http://wiki.apache.org/hadoop/ZooKeeper">Wiki</a>
</li>
<li class="current">
-<a class="selected" href="index.html">ZooKeeper 3.1 Documentation</a>
+<a class="selected" href="index.html">ZooKeeper 3.2 Documentation</a>
</li>
</ul>
<!--+
Modified: hadoop/zookeeper/trunk/docs/zookeeperAdmin.html
URL: http://svn.apache.org/viewvc/hadoop/zookeeper/trunk/docs/zookeeperAdmin.html?rev=768390&r1=768389&r2=768390&view=diff
==============================================================================
--- hadoop/zookeeper/trunk/docs/zookeeperAdmin.html (original)
+++ hadoop/zookeeper/trunk/docs/zookeeperAdmin.html Fri Apr 24 18:26:08 2009
@@ -67,7 +67,7 @@
<a class="unselected" href="http://wiki.apache.org/hadoop/ZooKeeper">Wiki</a>
</li>
<li class="current">
-<a class="selected" href="index.html">ZooKeeper 3.1 Documentation</a>
+<a class="selected" href="index.html">ZooKeeper 3.2 Documentation</a>
</li>
</ul>
<!--+
@@ -1107,10 +1107,38 @@
with ZooKeeper. If followers fall too far behind a leader, they
will be dropped.</p>
</dd>
+
+
+<dt>
+<term>group.x=nnnnn[:nnnnn]</term>
+</dt>
+<dd>
+<p>(No Java system property)</p>
+<p>Enables a hierarchical quorum construction."x" is a group identifier
+ and the numbers following the "=" sign correspond to server identifiers.
+ The left-hand side of the assignment is a colon-separated list of server
+ identifiers. Note that groups must be disjoint and the union of all groups
+ must be the ZooKeeper ensemble. </p>
+</dd>
+
+
+<dt>
+<term>weight.x=nnnnn</term>
+</dt>
+<dd>
+<p>(No Java system property)</p>
+<p>Used along with "group", it assigns a weight to a server when
+ forming quorums. Such a value corresponds to the weight of a server
+ when voting. There are a few parts of ZooKeeper that require voting
+ such as leader election and the atomic broadcast protocol. By default
+ the weight of server is 1. If the configuration defines groups, but not
+ weights, then a value of 1 will be assigned to all servers.
+ </p>
+</dd>
</dl>
<p></p>
-<a name="N10368"></a><a name="Unsafe+Options"></a>
+<a name="N1037A"></a><a name="Unsafe+Options"></a>
<h4>Unsafe Options</h4>
<p>The following options can be useful, but be careful when you use
them. The risk of each is explained along with the explanation of what
@@ -1155,7 +1183,7 @@
</dd>
</dl>
-<a name="N1039A"></a><a name="sc_zkCommands"></a>
+<a name="N103AC"></a><a name="sc_zkCommands"></a>
<h3 class="h4">ZooKeeper Commands: The Four Letter Words</h3>
<p>ZooKeeper responds to a small set of commands. Each command is
composed of four letters. You issue the commands to ZooKeeper via telnet
@@ -1180,15 +1208,6 @@
<dt>
-<term>kill</term>
-</dt>
-<dd>
-<p>Shuts down the server. This must be issued from the machine
- the ZooKeeper server is running on.</p>
-</dd>
-
-
-<dt>
<term>reqs</term>
</dt>
<dd>
@@ -1228,7 +1247,7 @@
<pre class="code">$ echo ruok | nc 127.0.0.1 5111
imok
</pre>
-<a name="N103E1"></a><a name="sc_dataFileManagement"></a>
+<a name="N103EC"></a><a name="sc_dataFileManagement"></a>
<h3 class="h4">Data File Management</h3>
<p>ZooKeeper stores its data in a data directory and its transaction
log in a transaction log directory. By default these two directories are
@@ -1236,7 +1255,7 @@
transaction log files in a separate directory than the data files.
Throughput increases and latency decreases when transaction logs reside
on a dedicated log devices.</p>
-<a name="N103EA"></a><a name="The+Data+Directory"></a>
+<a name="N103F5"></a><a name="The+Data+Directory"></a>
<h4>The Data Directory</h4>
<p>This directory has two files in it:</p>
<ul>
@@ -1282,14 +1301,14 @@
idempotent nature of its updates. By replaying the transaction log
against fuzzy snapshots ZooKeeper gets the state of the system at the
end of the log.</p>
-<a name="N10426"></a><a name="The+Log+Directory"></a>
+<a name="N10431"></a><a name="The+Log+Directory"></a>
<h4>The Log Directory</h4>
<p>The Log Directory contains the ZooKeeper transaction logs.
Before any update takes place, ZooKeeper ensures that the transaction
that represents the update is written to non-volatile storage. A new
log file is started each time a snapshot is begun. The log file's
suffix is the first zxid written to that log.</p>
-<a name="N10430"></a><a name="sc_filemanagement"></a>
+<a name="N1043B"></a><a name="sc_filemanagement"></a>
<h4>File Management</h4>
<p>The format of snapshot and log files does not change between
standalone ZooKeeper servers and different configurations of
@@ -1309,7 +1328,7 @@
this document for more details on setting a retention policy
and maintenance of ZooKeeper storage.
</p>
-<a name="N10445"></a><a name="sc_commonProblems"></a>
+<a name="N10450"></a><a name="sc_commonProblems"></a>
<h3 class="h4">Things to Avoid</h3>
<p>Here are some common problems you can avoid by configuring
ZooKeeper correctly:</p>
@@ -1363,7 +1382,7 @@
</dd>
</dl>
-<a name="N10469"></a><a name="sc_bestPractices"></a>
+<a name="N10474"></a><a name="sc_bestPractices"></a>
<h3 class="h4">Best Practices</h3>
<p>For best results, take note of the following list of good
Zookeeper practices. <em>[tbd...]</em>
Modified: hadoop/zookeeper/trunk/docs/zookeeperAdmin.pdf
URL: http://svn.apache.org/viewvc/hadoop/zookeeper/trunk/docs/zookeeperAdmin.pdf?rev=768390&r1=768389&r2=768390&view=diff
==============================================================================
Binary files - no diff available.
Modified: hadoop/zookeeper/trunk/docs/zookeeperInternals.html
URL: http://svn.apache.org/viewvc/hadoop/zookeeper/trunk/docs/zookeeperInternals.html?rev=768390&r1=768389&r2=768390&view=diff
==============================================================================
--- hadoop/zookeeper/trunk/docs/zookeeperInternals.html (original)
+++ hadoop/zookeeper/trunk/docs/zookeeperInternals.html Fri Apr 24 18:26:08 2009
@@ -67,7 +67,7 @@
<a class="unselected" href="http://wiki.apache.org/hadoop/ZooKeeper">Wiki</a>
</li>
<li class="current">
-<a class="selected" href="index.html">ZooKeeper 3.1 Documentation</a>
+<a class="selected" href="index.html">ZooKeeper 3.2 Documentation</a>
</li>
</ul>
<!--+
@@ -211,6 +211,9 @@
</ul>
</li>
<li>
+<a href="#sc_quorum">Quorums</a>
+</li>
+<li>
<a href="#sc_logging">Logging</a>
<ul class="minitoc">
<li>
@@ -617,8 +620,42 @@
</div>
+<a name="N10134"></a><a name="sc_quorum"></a>
+<h2 class="h3">Quorums</h2>
+<div class="section">
+<p>
+Atomic broadcast and leader election use the notion of quorum to guarantee a consistent
+view of the system. By default, ZooKeeper uses majority quorums, which means that every
+voting that happens in one of these protocols requires a majority to vote on. One example is
+acknowledging a leader proposal: the leader can only commit once it receives an
+acknowledgement from a quorum of servers.
+</p>
+<p>
+If we extract the properties that we really need from our use of majorities, we have that we only
+need to guarantee that groups of processes used to validate an operation by voting (e.g., acknowledging
+a leader proposal) pairwise intersect in at least one server. Using majorities guarantees such a property.
+However, there are other ways of constructing quorums different from majorities. For example, we can assign
+weights to the votes of servers, and say that the votes of some servers are more important. To obtain a quorum,
+we get enough votes so that the sum of weights of all votes is larger than half of the total sum of all weights.
+</p>
+<p>
+A different construction that uses weights and is useful in wide-area deployments (co-locations) is a hierarchical
+one. With this construction, we split the servers into disjoint groups and assign weights to processes. To form
+a quorum, we have to get a hold of enough servers from a majority of groups G, such that for each group g in G,
+the sum of votes from g is larger than half of the sum of weights in g. Interestingly, this construction enables
+smaller quorums. If we have, for example, 9 servers, we split them into 3 groups, and assign a weight of 1 to each
+server, then we are able to form quorums of size 4. Note that two subsets of processes composed each of a majority
+of servers from each of a majority of groups necessarily have a non-empty intersection. It is reasonable to expect
+that a majority of co-locations will have a majority of servers available with high probability.
+</p>
+<p>
+With ZooKeeper, we provide a user with the ability of configuring servers to use majority quorums, weights, or a
+hierarchy of groups.
+</p>
+</div>
+
-<a name="N10134"></a><a name="sc_logging"></a>
+<a name="N10147"></a><a name="sc_logging"></a>
<h2 class="h3">Logging</h2>
<div class="section">
<p>
@@ -629,10 +666,10 @@
of the <a href="zookeeperAdmin.html">ZooKeeper Administrator's Guide.</a>
</p>
-<a name="N10149"></a><a name="sc_developerGuidelines"></a>
+<a name="N1015C"></a><a name="sc_developerGuidelines"></a>
<h3 class="h4">Developer Guidelines</h3>
<p>Please follow these guidelines when submitting code. Patch reviewers will look for the following:</p>
-<a name="N10151"></a><a name="sc_rightLevel"></a>
+<a name="N10164"></a><a name="sc_rightLevel"></a>
<h4>Logging at the Right Level</h4>
<p>
There are <a href="http://logging.apache.org/log4j/1.2/apidocs/org/apache/log4j/Level.html#FATAL">6 levels of logging in log4j</a>.
@@ -667,7 +704,7 @@
<p>
ZooKeeper is typically run in production such that log messages of INFO level
severity and higher (more severe) are output to the log.</p>
-<a name="N1017C"></a><a name="sc_log4jIdioms"></a>
+<a name="N1018F"></a><a name="sc_log4jIdioms"></a>
<h4>Use of Standard log4j Idioms</h4>
<p>
<em>Static Message Logging</em>
Modified: hadoop/zookeeper/trunk/docs/zookeeperInternals.pdf
URL: http://svn.apache.org/viewvc/hadoop/zookeeper/trunk/docs/zookeeperInternals.pdf?rev=768390&r1=768389&r2=768390&view=diff
==============================================================================
Binary files - no diff available.
Modified: hadoop/zookeeper/trunk/docs/zookeeperJMX.html
URL: http://svn.apache.org/viewvc/hadoop/zookeeper/trunk/docs/zookeeperJMX.html?rev=768390&r1=768389&r2=768390&view=diff
==============================================================================
--- hadoop/zookeeper/trunk/docs/zookeeperJMX.html (original)
+++ hadoop/zookeeper/trunk/docs/zookeeperJMX.html Fri Apr 24 18:26:08 2009
@@ -67,7 +67,7 @@
<a class="unselected" href="http://wiki.apache.org/hadoop/ZooKeeper">Wiki</a>
</li>
<li class="current">
-<a class="selected" href="index.html">ZooKeeper 3.1 Documentation</a>
+<a class="selected" href="index.html">ZooKeeper 3.2 Documentation</a>
</li>
</ul>
<!--+
Modified: hadoop/zookeeper/trunk/docs/zookeeperOver.html
URL: http://svn.apache.org/viewvc/hadoop/zookeeper/trunk/docs/zookeeperOver.html?rev=768390&r1=768389&r2=768390&view=diff
==============================================================================
--- hadoop/zookeeper/trunk/docs/zookeeperOver.html (original)
+++ hadoop/zookeeper/trunk/docs/zookeeperOver.html Fri Apr 24 18:26:08 2009
@@ -67,7 +67,7 @@
<a class="unselected" href="http://wiki.apache.org/hadoop/ZooKeeper">Wiki</a>
</li>
<li class="current">
-<a class="selected" href="index.html">ZooKeeper 3.1 Documentation</a>
+<a class="selected" href="index.html">ZooKeeper 3.2 Documentation</a>
</li>
</ul>
<!--+
Modified: hadoop/zookeeper/trunk/docs/zookeeperProgrammers.html
URL: http://svn.apache.org/viewvc/hadoop/zookeeper/trunk/docs/zookeeperProgrammers.html?rev=768390&r1=768389&r2=768390&view=diff
==============================================================================
--- hadoop/zookeeper/trunk/docs/zookeeperProgrammers.html (original)
+++ hadoop/zookeeper/trunk/docs/zookeeperProgrammers.html Fri Apr 24 18:26:08 2009
@@ -67,7 +67,7 @@
<a class="unselected" href="http://wiki.apache.org/hadoop/ZooKeeper">Wiki</a>
</li>
<li class="current">
-<a class="selected" href="index.html">ZooKeeper 3.1 Documentation</a>
+<a class="selected" href="index.html">ZooKeeper 3.2 Documentation</a>
</li>
</ul>
<!--+
Modified: hadoop/zookeeper/trunk/docs/zookeeperQuotas.html
URL: http://svn.apache.org/viewvc/hadoop/zookeeper/trunk/docs/zookeeperQuotas.html?rev=768390&r1=768389&r2=768390&view=diff
==============================================================================
--- hadoop/zookeeper/trunk/docs/zookeeperQuotas.html (original)
+++ hadoop/zookeeper/trunk/docs/zookeeperQuotas.html Fri Apr 24 18:26:08 2009
@@ -67,7 +67,7 @@
<a class="unselected" href="http://wiki.apache.org/hadoop/ZooKeeper">Wiki</a>
</li>
<li class="current">
-<a class="selected" href="index.html">ZooKeeper 3.1 Documentation</a>
+<a class="selected" href="index.html">ZooKeeper 3.2 Documentation</a>
</li>
</ul>
<!--+
Modified: hadoop/zookeeper/trunk/docs/zookeeperStarted.html
URL: http://svn.apache.org/viewvc/hadoop/zookeeper/trunk/docs/zookeeperStarted.html?rev=768390&r1=768389&r2=768390&view=diff
==============================================================================
--- hadoop/zookeeper/trunk/docs/zookeeperStarted.html (original)
+++ hadoop/zookeeper/trunk/docs/zookeeperStarted.html Fri Apr 24 18:26:08 2009
@@ -67,7 +67,7 @@
<a class="unselected" href="http://wiki.apache.org/hadoop/ZooKeeper">Wiki</a>
</li>
<li class="current">
-<a class="selected" href="index.html">ZooKeeper 3.1 Documentation</a>
+<a class="selected" href="index.html">ZooKeeper 3.2 Documentation</a>
</li>
</ul>
<!--+
Modified: hadoop/zookeeper/trunk/docs/zookeeperTutorial.html
URL: http://svn.apache.org/viewvc/hadoop/zookeeper/trunk/docs/zookeeperTutorial.html?rev=768390&r1=768389&r2=768390&view=diff
==============================================================================
--- hadoop/zookeeper/trunk/docs/zookeeperTutorial.html (original)
+++ hadoop/zookeeper/trunk/docs/zookeeperTutorial.html Fri Apr 24 18:26:08 2009
@@ -67,7 +67,7 @@
<a class="unselected" href="http://wiki.apache.org/hadoop/ZooKeeper">Wiki</a>
</li>
<li class="current">
-<a class="selected" href="index.html">ZooKeeper 3.1 Documentation</a>
+<a class="selected" href="index.html">ZooKeeper 3.2 Documentation</a>
</li>
</ul>
<!--+
Modified: hadoop/zookeeper/trunk/src/docs/src/documentation/content/xdocs/zookeeperAdmin.xml
URL: http://svn.apache.org/viewvc/hadoop/zookeeper/trunk/src/docs/src/documentation/content/xdocs/zookeeperAdmin.xml?rev=768390&r1=768389&r2=768390&view=diff
==============================================================================
--- hadoop/zookeeper/trunk/src/docs/src/documentation/content/xdocs/zookeeperAdmin.xml (original)
+++ hadoop/zookeeper/trunk/src/docs/src/documentation/content/xdocs/zookeeperAdmin.xml Fri Apr 24 18:26:08 2009
@@ -775,8 +775,37 @@
will be dropped.</para>
</listitem>
</varlistentry>
- </variablelist>
+ <varlistentry>
+ <term>group.x=nnnnn[:nnnnn]</term>
+
+ <listitem>
+ <para>(No Java system property)</para>
+
+ <para>Enables a hierarchical quorum construction."x" is a group identifier
+ and the numbers following the "=" sign correspond to server identifiers.
+ The left-hand side of the assignment is a colon-separated list of server
+ identifiers. Note that groups must be disjoint and the union of all groups
+ must be the ZooKeeper ensemble. </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term>weight.x=nnnnn</term>
+
+ <listitem>
+ <para>(No Java system property)</para>
+
+ <para>Used along with "group", it assigns a weight to a server when
+ forming quorums. Such a value corresponds to the weight of a server
+ when voting. There are a few parts of ZooKeeper that require voting
+ such as leader election and the atomic broadcast protocol. By default
+ the weight of server is 1. If the configuration defines groups, but not
+ weights, then a value of 1 will be assigned to all servers.
+ </para>
+ </listitem>
+ </varlistentry>
+ </variablelist>
<para></para>
</section>
Modified: hadoop/zookeeper/trunk/src/docs/src/documentation/content/xdocs/zookeeperInternals.xml
URL: http://svn.apache.org/viewvc/hadoop/zookeeper/trunk/src/docs/src/documentation/content/xdocs/zookeeperInternals.xml?rev=768390&r1=768389&r2=768390&view=diff
==============================================================================
--- hadoop/zookeeper/trunk/src/docs/src/documentation/content/xdocs/zookeeperInternals.xml (original)
+++ hadoop/zookeeper/trunk/src/docs/src/documentation/content/xdocs/zookeeperInternals.xml Fri Apr 24 18:26:08 2009
@@ -356,6 +356,42 @@
</section>
+<section id="sc_quorum">
+<title>Quorums</title>
+
+<para>
+Atomic broadcast and leader election use the notion of quorum to guarantee a consistent
+view of the system. By default, ZooKeeper uses majority quorums, which means that every
+voting that happens in one of these protocols requires a majority to vote on. One example is
+acknowledging a leader proposal: the leader can only commit once it receives an
+acknowledgement from a quorum of servers.
+</para>
+
+<para>
+If we extract the properties that we really need from our use of majorities, we have that we only
+need to guarantee that groups of processes used to validate an operation by voting (e.g., acknowledging
+a leader proposal) pairwise intersect in at least one server. Using majorities guarantees such a property.
+However, there are other ways of constructing quorums different from majorities. For example, we can assign
+weights to the votes of servers, and say that the votes of some servers are more important. To obtain a quorum,
+we get enough votes so that the sum of weights of all votes is larger than half of the total sum of all weights.
+</para>
+
+<para>
+A different construction that uses weights and is useful in wide-area deployments (co-locations) is a hierarchical
+one. With this construction, we split the servers into disjoint groups and assign weights to processes. To form
+a quorum, we have to get a hold of enough servers from a majority of groups G, such that for each group g in G,
+the sum of votes from g is larger than half of the sum of weights in g. Interestingly, this construction enables
+smaller quorums. If we have, for example, 9 servers, we split them into 3 groups, and assign a weight of 1 to each
+server, then we are able to form quorums of size 4. Note that two subsets of processes composed each of a majority
+of servers from each of a majority of groups necessarily have a non-empty intersection. It is reasonable to expect
+that a majority of co-locations will have a majority of servers available with high probability.
+</para>
+
+<para>
+With ZooKeeper, we provide a user with the ability of configuring servers to use majority quorums, weights, or a
+hierarchy of groups.
+</para>
+</section>
<section id="sc_logging">
Modified: hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/AckRequestProcessor.java
URL: http://svn.apache.org/viewvc/hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/AckRequestProcessor.java?rev=768390&r1=768389&r2=768390&view=diff
==============================================================================
--- hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/AckRequestProcessor.java (original)
+++ hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/AckRequestProcessor.java Fri Apr 24 18:26:08 2009
@@ -36,7 +36,7 @@
* Forward the request as an ACK to the leader
*/
public void processRequest(Request request) {
- leader.processAck(request.zxid, null);
+ leader.processAck(leader.self.getId(), request.zxid, null);
}
public void shutdown() {
Modified: hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/AuthFastLeaderElection.java
URL: http://svn.apache.org/viewvc/hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/AuthFastLeaderElection.java?rev=768390&r1=768389&r2=768390&view=diff
==============================================================================
--- hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/AuthFastLeaderElection.java (original)
+++ hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/AuthFastLeaderElection.java Fri Apr 24 18:26:08 2009
@@ -789,8 +789,9 @@
private boolean termPredicate(HashMap<InetSocketAddress, Vote> votes,
long l, long zxid) {
- int count = 0;
+
Collection<Vote> votesCast = votes.values();
+ int count = 0;
/*
* First make the views consistent. Sometimes peers will have different
* zxids for a server depending on timing.
Modified: hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/FastLeaderElection.java
URL: http://svn.apache.org/viewvc/hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/FastLeaderElection.java?rev=768390&r1=768389&r2=768390&view=diff
==============================================================================
--- hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/FastLeaderElection.java (original)
+++ hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/FastLeaderElection.java Fri Apr 24 18:26:08 2009
@@ -21,7 +21,9 @@
import java.nio.ByteBuffer;
import java.util.Collection;
+import java.util.Map;
import java.util.HashMap;
+import java.util.HashSet;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;
@@ -454,19 +456,19 @@
HashMap<Long, Vote> votes,
Vote vote) {
- Collection<Vote> votesCast = votes.values();
- int count = 0;
+ HashSet<Long> set = new HashSet<Long>();
/*
* First make the views consistent. Sometimes peers will have
* different zxids for a server depending on timing.
*/
- for (Vote v : votesCast) {
- if (v.equals(vote))
- count++;
+ for (Map.Entry<Long,Vote> entry : votes.entrySet()) {
+ if (vote.equals(entry.getValue())){
+ set.add(entry.getKey());
+ }
}
- if (count > (self.quorumPeers.size() / 2))
+ if(self.getQuorumVerifier().containsQuorum(set))
return true;
else
return false;
Modified: hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/Follower.java
URL: http://svn.apache.org/viewvc/hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/Follower.java?rev=768390&r1=768389&r2=768390&view=diff
==============================================================================
--- hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/Follower.java (original)
+++ hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/Follower.java Fri Apr 24 18:26:08 2009
@@ -170,10 +170,23 @@
sock.getInputStream()));
bufferedOutput = new BufferedOutputStream(sock.getOutputStream());
leaderOs = BinaryOutputArchive.getArchive(bufferedOutput);
+
+ /*
+ * Send follower info, including last zxid and sid
+ */
QuorumPacket qp = new QuorumPacket();
- qp.setType(Leader.LASTZXID);
+ qp.setType(Leader.FOLLOWERINFO);
long sentLastZxid = self.getLastLoggedZxid();
qp.setZxid(sentLastZxid);
+
+ /*
+ * Add sid to payload
+ */
+ ByteArrayOutputStream bsid = new ByteArrayOutputStream();
+ DataOutputStream dsid = new DataOutputStream(bsid);
+ dsid.writeLong(self.getId());
+ qp.setData(bsid.toByteArray());
+
writePacket(qp, true);
readPacket(qp);
long newLeaderZxid = qp.getZxid();
@@ -222,6 +235,7 @@
writePacket(ack, true);
sock.setSoTimeout(self.tickTime * self.syncLimit);
zk.startup();
+
while (self.running) {
readPacket(qp);
switch (qp.getType()) {
Modified: hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/FollowerHandler.java
URL: http://svn.apache.org/viewvc/hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/FollowerHandler.java?rev=768390&r1=768389&r2=768390&view=diff
==============================================================================
--- hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/FollowerHandler.java (original)
+++ hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/FollowerHandler.java Fri Apr 24 18:26:08 2009
@@ -52,6 +52,15 @@
final Leader leader;
long tickOfLastAck;
+
+ /**
+ * ZooKeeper server identifier of this follower
+ */
+ protected long sid = 0;
+
+ long getSid(){
+ return sid;
+ }
/**
* The packets to be sent to the follower
@@ -129,6 +138,7 @@
String type = null;
String mess = null;
Record txn = null;
+
switch (p.getType()) {
case Leader.ACK:
type = "ACK";
@@ -136,8 +146,8 @@
case Leader.COMMIT:
type = "COMMIT";
break;
- case Leader.LASTZXID:
- type = "LASTZXID";
+ case Leader.FOLLOWERINFO:
+ type = "FOLLOWERINFO";
break;
case Leader.NEWLEADER:
type = "NEWLEADER";
@@ -200,12 +210,21 @@
QuorumPacket qp = new QuorumPacket();
ia.readRecord(qp, "packet");
- if (qp.getType() != Leader.LASTZXID) {
- LOG.error("First packet " + qp.toString()
- + " is not LASTZXID!");
+ if(qp.getType() != leader.FOLLOWERINFO){
+ LOG.error("First packet " + qp.toString()
+ + " is not FOLLOWERINFO!");
return;
}
+ if (qp.getData() != null) {
+ ByteBuffer bbsid = ByteBuffer.wrap(qp.getData());
+ this.sid = bbsid.getLong();
+ } else {
+ this.sid = leader.followerCounter.getAndDecrement();
+ }
+ LOG.info("The follower sid: " + this.sid);
+
long peerLastZxid = qp.getZxid();
+
int packetToSend = Leader.SNAP;
boolean logTxns = true;
@@ -307,7 +326,7 @@
switch (qp.getType()) {
case Leader.ACK:
- leader.processAck(qp.getZxid(), sock.getLocalSocketAddress());
+ leader.processAck(this.sid, qp.getZxid(), sock.getLocalSocketAddress());
break;
case Leader.PING:
// Process the touches
Modified: hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/Leader.java
URL: http://svn.apache.org/viewvc/hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/Leader.java?rev=768390&r1=768389&r2=768390&view=diff
==============================================================================
--- hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/Leader.java (original)
+++ hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/Leader.java Fri Apr 24 18:26:08 2009
@@ -20,6 +20,7 @@
import java.io.ByteArrayOutputStream;
import java.io.IOException;
+import java.lang.StringBuffer;
import java.net.BindException;
import java.net.ServerSocket;
import java.net.Socket;
@@ -31,6 +32,7 @@
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
+import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.ConcurrentMap;
@@ -55,7 +57,7 @@
static public class Proposal {
public QuorumPacket packet;
- public int ackCount;
+ public HashSet<Long> ackSet = new HashSet<Long>();
public Request request;
@@ -80,7 +82,9 @@
//Pending sync requests
public HashMap<Long,List<FollowerSyncRequest>> pendingSyncs = new HashMap<Long,List<FollowerSyncRequest>>();
-
+
+ //Follower counter
+ AtomicLong followerCounter = new AtomicLong(-1);
/**
* Adds follower to the leader.
*
@@ -149,10 +153,10 @@
final static int NEWLEADER = 10;
/**
- * This message type is sent by a follower to indicate the last zxid in its
- * log.
+ * This message type is sent by a follower to pass the last zxid. This is here
+ * for backward compatibility purposes.
*/
- final static int LASTZXID = 11;
+ final static int FOLLOWERINFO = 11;
/**
* This message type is sent by the leader to indicate that the follower is
@@ -198,7 +202,7 @@
* between the leader and the follower.
*/
final static int SYNC = 7;
-
+
private ConcurrentMap<Long, Proposal> outstandingProposals = new ConcurrentHashMap<Long, Proposal>();
ConcurrentLinkedQueue<Proposal> toBeApplied = new ConcurrentLinkedQueue<Proposal>();
@@ -275,15 +279,23 @@
// We have to get at least a majority of servers in sync with
// us. We do this by waiting for the NEWLEADER packet to get
// acknowledged
- newLeaderProposal.ackCount++;
- while (newLeaderProposal.ackCount <= self.quorumPeers.size() / 2) {
+ newLeaderProposal.ackSet.add(self.getId());
+ while (!self.getQuorumVerifier().containsQuorum(newLeaderProposal.ackSet)){
+ //while (newLeaderProposal.ackCount <= self.quorumPeers.size() / 2) {
if (self.tick > self.initLimit) {
// Followers aren't syncing fast enough,
// renounce leadership!
- shutdown("Waiting for " + (self.quorumPeers.size() / 2)
- + " followers, only synced with "
- + newLeaderProposal.ackCount);
- if (followers.size() >= self.quorumPeers.size() / 2) {
+ StringBuffer ackToString = new StringBuffer();
+ for(Long id : newLeaderProposal.ackSet)
+ ackToString.append(id + ": ");
+
+ shutdown("Waiting for a quorum of followers, only synced with: " + ackToString);
+ HashSet<Long> followerSet = new HashSet<Long>();
+ for(FollowerHandler f : followers)
+ followerSet.add(f.getSid());
+
+ if (self.getQuorumVerifier().containsQuorum(followerSet)) {
+ //if (followers.size() >= self.quorumPeers.size() / 2) {
LOG.warn("Enough followers present. "+
"Perhaps the initTicks need to be increased.");
}
@@ -312,24 +324,29 @@
self.tick++;
}
int syncedCount = 0;
+ HashSet<Long> syncedSet = new HashSet<Long>();
+
// lock on the followers when we use it.
+ syncedSet.add(self.getId());
synchronized (followers) {
for (FollowerHandler f : followers) {
if (f.synced()) {
syncedCount++;
+ syncedSet.add(f.getSid());
}
f.ping();
}
}
- if (!tickSkip && syncedCount < self.quorumPeers.size() / 2) {
+ if (!tickSkip && !self.getQuorumVerifier().containsQuorum(syncedSet)) {
+ //if (!tickSkip && syncedCount < self.quorumPeers.size() / 2) {
// Lost quorum, shutdown
shutdown("Only " + syncedCount + " followers, need "
+ (self.quorumPeers.size() / 2));
// make sure the order is the same!
// the leader goes to looking
return;
- }
- tickSkip = !tickSkip;
+ }
+ tickSkip = !tickSkip;
}
} finally {
zk.unregisterJMX(this);
@@ -385,7 +402,7 @@
* the zxid of the proposal sent out
* @param followerAddr
*/
- synchronized public void processAck(long zxid, SocketAddress followerAddr) {
+ synchronized public void processAck(long sid, long zxid, SocketAddress followerAddr) {
boolean first = true;
if (LOG.isDebugEnabled()) {
@@ -419,19 +436,18 @@
+ Long.toHexString(zxid) + " from " + followerAddr);
return;
}
- p.ackCount++;
+
+ p.ackSet.add(sid);
if (LOG.isDebugEnabled()) {
LOG.debug("Count for zxid: 0x" + Long.toHexString(zxid)
- + " is " + p.ackCount);
+ + " is " + p.ackSet.size());
}
-
- if (p.ackCount > self.quorumPeers.size() / 2){
+ if (self.getQuorumVerifier().containsQuorum(p.ackSet)){
if (zxid != lastCommitted+1) {
LOG.warn("Commiting zxid 0x" + Long.toHexString(zxid)
+ " from " + followerAddr + " not first!");
LOG.warn("First is "
+ (lastCommitted+1));
- //System.exit(13);
}
outstandingProposals.remove(zxid);
if (p.request != null) {
Modified: hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/QuorumPeer.java
URL: http://svn.apache.org/viewvc/hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/QuorumPeer.java?rev=768390&r1=768389&r2=768390&view=diff
==============================================================================
--- hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/QuorumPeer.java (original)
+++ hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/QuorumPeer.java Fri Apr 24 18:26:08 2009
@@ -34,6 +34,8 @@
import org.apache.zookeeper.server.NIOServerCnxn;
import org.apache.zookeeper.server.ZooKeeperServer;
import org.apache.zookeeper.server.persistence.FileTxnSnapLog;
+import org.apache.zookeeper.server.quorum.flexible.QuorumVerifier;
+import org.apache.zookeeper.server.quorum.flexible.QuorumMaj;
/**
* This class manages the quorum protocol. There are three states this server
@@ -108,6 +110,13 @@
public int getQuorumSize(){
return quorumPeers.size();
}
+
+ /**
+ * QuorumVerifier implementation; default (majority).
+ */
+
+ private QuorumVerifier quorumConfig;
+
/**
* My id
*/
@@ -259,10 +268,24 @@
quorumStats = new QuorumStats(this);
}
+
+ /**
+ * For backward compatibility purposes, we instantiate QuorumMaj by default.
+ */
+
public QuorumPeer(Map<Long, QuorumServer> quorumPeers, File dataDir,
File dataLogDir, int electionType,
long myid, int tickTime, int initLimit, int syncLimit,
NIOServerCnxn.Factory cnxnFactory) throws IOException {
+ this(quorumPeers, dataDir, dataLogDir, electionType, myid, tickTime,
+ initLimit, syncLimit, cnxnFactory, new QuorumMaj(quorumPeers.size()));
+ }
+
+ public QuorumPeer(Map<Long, QuorumServer> quorumPeers, File dataDir,
+ File dataLogDir, int electionType,
+ long myid, int tickTime, int initLimit, int syncLimit,
+ NIOServerCnxn.Factory cnxnFactory,
+ QuorumVerifier quorumConfig) throws IOException {
this();
this.cnxnFactory = cnxnFactory;
this.quorumPeers = quorumPeers;
@@ -272,6 +295,9 @@
this.initLimit = initLimit;
this.syncLimit = syncLimit;
this.logFactory = new FileTxnSnapLog(dataLogDir, dataDir);
+ if(quorumConfig == null)
+ this.quorumConfig = new QuorumMaj(quorumPeers.size());
+ else this.quorumConfig = quorumConfig;
}
QuorumStats quorumStats() {
@@ -313,6 +339,7 @@
this.electionAlg = createElectionAlgorithm(electionType);
}
+
/**
* This constructor is only used by the existing unit test code.
* It defaults to FileLogProvider persistence provider.
@@ -324,7 +351,23 @@
{
this(quorumPeers, snapDir, logDir, electionAlg,
myid,tickTime, initLimit,syncLimit,
- new NIOServerCnxn.Factory(clientPort));
+ new NIOServerCnxn.Factory(clientPort),
+ new QuorumMaj(quorumPeers.size()));
+ }
+
+ /**
+ * This constructor is only used by the existing unit test code.
+ * It defaults to FileLogProvider persistence provider.
+ */
+ public QuorumPeer(Map<Long,QuorumServer> quorumPeers, File snapDir,
+ File logDir, int clientPort, int electionAlg,
+ long myid, int tickTime, int initLimit, int syncLimit,
+ QuorumVerifier quorumConfig)
+ throws IOException
+ {
+ this(quorumPeers, snapDir, logDir, electionAlg,
+ myid,tickTime, initLimit,syncLimit,
+ new NIOServerCnxn.Factory(clientPort), quorumConfig);
}
public long getLastLoggedZxid(){
@@ -577,6 +620,19 @@
}
/**
+ * Return QuorumVerifier object
+ */
+
+ public QuorumVerifier getQuorumVerifier(){
+ return quorumConfig;
+
+ }
+
+ public void setQuorumVerifier(QuorumVerifier quorumConfig){
+ this.quorumConfig = quorumConfig;
+ }
+
+ /**
* Get an instance of LeaderElection
*/
Modified: hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/QuorumPeerConfig.java
URL: http://svn.apache.org/viewvc/hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/QuorumPeerConfig.java?rev=768390&r1=768389&r2=768390&view=diff
==============================================================================
--- hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/QuorumPeerConfig.java (original)
+++ hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/QuorumPeerConfig.java Fri Apr 24 18:26:08 2009
@@ -33,6 +33,9 @@
import org.apache.log4j.Logger;
import org.apache.zookeeper.server.ZooKeeperServer;
import org.apache.zookeeper.server.quorum.QuorumPeer.QuorumServer;
+import org.apache.zookeeper.server.quorum.flexible.QuorumVerifier;
+import org.apache.zookeeper.server.quorum.flexible.QuorumMaj;
+import org.apache.zookeeper.server.quorum.flexible.QuorumHierarchical;
public class QuorumPeerConfig {
private static final Logger LOG = Logger.getLogger(QuorumPeerConfig.class);
@@ -50,6 +53,10 @@
new HashMap<Long, QuorumServer>();
protected long serverId;
+ protected HashMap<Long, Long> serverWeight = new HashMap<Long, Long>();
+ protected HashMap<Long, Long> serverGroup = new HashMap<Long, Long>();
+ protected int numGroups = 0;
+ protected QuorumVerifier quorumVerifier;
@SuppressWarnings("serial")
public static class ConfigException extends Exception {
@@ -93,7 +100,8 @@
}
}
- protected void parseProperties(Properties zkProp) throws IOException {
+ protected void parseProperties(Properties zkProp)
+ throws IOException, ConfigException {
for (Entry<Object, Object> entry : zkProp.entrySet()) {
String key = entry.getKey().toString();
String value = entry.getValue().toString();
@@ -129,6 +137,25 @@
servers.put(Long.valueOf(sid), new QuorumServer(sid, addr,
electionAddr));
}
+ } else if (key.startsWith("group")) {
+ int dot = key.indexOf('.');
+ long gid = Long.parseLong(key.substring(dot + 1));
+
+ numGroups++;
+
+ String parts[] = value.split(":");
+ for(String s : parts){
+ long sid = Long.parseLong(s);
+ if(serverGroup.containsKey(sid))
+ throw new ConfigException("Server " + sid + "is in multiple groups");
+ else
+ serverGroup.put(sid, gid);
+ }
+
+ } else if(key.startsWith("weight")) {
+ int dot = key.indexOf('.');
+ long sid = Long.parseLong(key.substring(dot + 1));
+ serverWeight.put(sid, Long.parseLong(value));
} else {
System.setProperty("zookeeper." + key, value);
}
@@ -169,6 +196,34 @@
}
}
+ /*
+ * Default of quorum config is majority
+ */
+ if(serverGroup.size() > 0){
+ if(servers.size() != serverGroup.size())
+ throw new ConfigException("Every server must be in exactly one group");
+ /*
+ * The deafult weight of a server is 1
+ */
+ for(QuorumServer s : servers.values()){
+ if(!serverWeight.containsKey(s.id))
+ serverWeight.put(s.id, (long) 1);
+ }
+
+ /*
+ * Set the quorumVerifier to be QuorumHierarchical
+ */
+ quorumVerifier = new QuorumHierarchical(numGroups,
+ serverWeight, serverGroup);
+ } else {
+ /*
+ * The default QuorumVerifier is QuorumMaj
+ */
+
+ LOG.info("Defaulting to majority quorums");
+ quorumVerifier = new QuorumMaj(servers.size());
+ }
+
File myIdFile = new File(dataDir, "myid");
if (!myIdFile.exists()) {
throw new IllegalArgumentException(myIdFile.toString()
@@ -199,7 +254,10 @@
public int getSyncLimit() { return syncLimit; }
public int getElectionAlg() { return electionAlg; }
public int getElectionPort() { return electionPort; }
-
+ public QuorumVerifier getQuorumVerifier() {
+ return quorumVerifier;
+ }
+
public Map<Long,QuorumServer> getServers() {
return Collections.unmodifiableMap(servers);
}
Modified: hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/QuorumPeerMain.java
URL: http://svn.apache.org/viewvc/hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/QuorumPeerMain.java?rev=768390&r1=768389&r2=768390&view=diff
==============================================================================
--- hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/QuorumPeerMain.java (original)
+++ hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/QuorumPeerMain.java Fri Apr 24 18:26:08 2009
@@ -121,6 +121,7 @@
quorumPeer.setTickTime(config.getTickTime());
quorumPeer.setInitLimit(config.getInitLimit());
quorumPeer.setSyncLimit(config.getSyncLimit());
+ quorumPeer.setQuorumVerifier(config.getQuorumVerifier());
quorumPeer.setCnxnFactory(cnxnFactory);
quorumPeer.start();
Added: hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/flexible/QuorumHierarchical.java
URL: http://svn.apache.org/viewvc/hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/flexible/QuorumHierarchical.java?rev=768390&view=auto
==============================================================================
--- hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/flexible/QuorumHierarchical.java (added)
+++ hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/flexible/QuorumHierarchical.java Fri Apr 24 18:26:08 2009
@@ -0,0 +1,249 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.zookeeper.server.quorum.flexible;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.HashMap;
+import java.util.Properties;
+import java.util.Map.Entry;
+
+
+import org.apache.log4j.Logger;
+
+import org.apache.zookeeper.server.quorum.QuorumPeerConfig.ConfigException;
+
+
+/**
+ * This class implements a validator for hierarchical quorums. With this
+ * construction, zookeeper servers are split into disjoint groups, and
+ * each server has a weight. We obtain a quorum if we get more than half
+ * of the total weight of a group for a majority of groups.
+ *
+ * The configuration of quorums uses two parameters: group and weight.
+ * Groups are sets of ZooKeeper servers, and we set a group by passing
+ * a colon-separated list of server ids. It is also necessary to assign
+ * weights to server. Here is an example of a configuration that creates
+ * three groups and assigns a weight of 1 to each server:
+ *
+ * group.1=1:2:3
+ * group.2=4:5:6
+ * group.3=7:8:9
+ *
+ * weight.1=1
+ * weight.2=1
+ * weight.3=1
+ * weight.4=1
+ * weight.5=1
+ * weight.6=1
+ * weight.7=1
+ * weight.8=1
+ * weight.9=1
+ *
+ * Note that it is still necessary to define peers using the server keyword.
+ */
+
+public class QuorumHierarchical implements QuorumVerifier {
+ private static final Logger LOG = Logger.getLogger(QuorumHierarchical.class);
+
+ HashMap<Long, Long> serverWeight;
+ HashMap<Long, Long> serverGroup;
+ HashMap<Long, Long> groupWeight;
+
+ int numGroups;
+
+ /**
+ * This contructor requires the quorum configuration
+ * to be declared in a separate file, and it takes the
+ * file as an input parameter.
+ */
+ public QuorumHierarchical(String filename)
+ throws ConfigException {
+ this.serverWeight = new HashMap<Long, Long>();
+ this.serverGroup = new HashMap<Long, Long>();
+ this.groupWeight = new HashMap<Long, Long>();
+ this.numGroups = 0;
+
+ readConfigFile(filename);
+ }
+
+ /**
+ * This constructor takes a set of properties. We use
+ * it in the unit test for this feature.
+ */
+
+ public QuorumHierarchical(Properties qp)
+ throws ConfigException {
+ this.serverWeight = new HashMap<Long, Long>();
+ this.serverGroup = new HashMap<Long, Long>();
+ this.groupWeight = new HashMap<Long, Long>();
+ this.numGroups = 0;
+
+ parse(qp);
+
+ LOG.info(serverWeight.size() + ", " + serverGroup.size() + ", " + groupWeight.size());
+ }
+
+ /**
+ * This contructor takes the two hash maps needed to enable
+ * validating quorums. We use it with QuorumPeerConfig. That is,
+ * we declare weights and groups in the server configuration
+ * file along with the other parameters.
+ */
+ public QuorumHierarchical(int numGroups,
+ HashMap<Long, Long> serverWeight,
+ HashMap<Long, Long> serverGroup)
+ {
+ this.serverWeight = serverWeight;
+ this.serverGroup = serverGroup;
+ this.groupWeight = new HashMap<Long, Long>();
+
+ computeGroupWeight();
+ this.numGroups = numGroups;
+ }
+
+
+ /**
+ * Reads a configration file. Called from the constructor
+ * that takes a file as an input.
+ */
+ private void readConfigFile(String filename)
+ throws ConfigException{
+ File configFile = new File(filename);
+
+ LOG.info("Reading configuration from: " + configFile);
+
+ try {
+ if (!configFile.exists()) {
+ throw new IllegalArgumentException(configFile.toString()
+ + " file is missing");
+ }
+
+ Properties cfg = new Properties();
+ FileInputStream in = new FileInputStream(configFile);
+ try {
+ cfg.load(in);
+ } finally {
+ in.close();
+ }
+
+ parse(cfg);
+ } catch (IOException e) {
+ throw new ConfigException("Error processing " + filename, e);
+ } catch (IllegalArgumentException e) {
+ throw new ConfigException("Error processing " + filename, e);
+ }
+
+ }
+
+
+ /**
+ * Parse properties if configuration given in a separate file.
+ */
+ private void parse(Properties quorumProp){
+ for (Entry<Object, Object> entry : quorumProp.entrySet()) {
+ String key = entry.getKey().toString();
+ String value = entry.getValue().toString();
+ if (key.startsWith("group")) {
+ int dot = key.indexOf('.');
+ long gid = Long.parseLong(key.substring(dot + 1));
+
+ numGroups++;
+
+ String parts[] = value.split(":");
+ for(String s : parts){
+ long sid = Long.parseLong(s);
+ serverGroup.put(sid, gid);
+ }
+
+
+ } else if(key.startsWith("weight")) {
+ int dot = key.indexOf('.');
+ long sid = Long.parseLong(key.substring(dot + 1));
+ serverWeight.put(sid, Long.parseLong(value));
+ }
+ }
+
+ computeGroupWeight();
+ }
+
+ /**
+ * This method pre-computes the weights of groups to speed up processing
+ * when validating a given set. We compute the weights of groups in
+ * different places, so we have a separate method.
+ */
+ private void computeGroupWeight(){
+ for(long sid : serverGroup.keySet()){
+ Long gid = serverGroup.get(sid);
+ if(!groupWeight.containsKey(gid))
+ groupWeight.put(gid, serverWeight.get(sid));
+ else {
+ long totalWeight = serverWeight.get(sid) + groupWeight.get(gid);
+ groupWeight.put(gid, totalWeight);
+ }
+
+ }
+ }
+
+ /**
+ * Verifies if a given set is a quorum.
+ */
+ public boolean containsQuorum(HashSet<Long> set){
+ HashMap<Long, Long> expansion = new HashMap<Long, Long>();
+
+ /*
+ * Adds up weights per group
+ */
+ if(set.size() == 0) return false;
+ else LOG.info("Set size: " + set.size());
+
+ for(long sid : set){
+ Long gid = serverGroup.get(sid);
+ if(!expansion.containsKey(gid))
+ expansion.put(gid, serverWeight.get(sid));
+ else {
+ long totalWeight = serverWeight.get(sid) + expansion.get(gid);
+ expansion.put(gid, totalWeight);
+ }
+ }
+
+ /*
+ * Check if all groups have majority
+ */
+ boolean majPerGroup = true;
+ for(long gid : expansion.keySet()) {
+ LOG.info("gid: " + expansion.get(gid));
+ if(expansion.get(gid) <= (groupWeight.get(gid) / 2) )
+ majPerGroup = false;
+ }
+
+ if((expansion.size() > (numGroups / 2)) && majPerGroup){
+ LOG.info("Positive set size: " + set.size());
+ return true;
+ }
+ else {
+ LOG.info("Negative set size: " + set.size());
+ return false;
+ }
+ }
+}
\ No newline at end of file
Added: hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/flexible/QuorumMaj.java
URL: http://svn.apache.org/viewvc/hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/flexible/QuorumMaj.java?rev=768390&view=auto
==============================================================================
--- hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/flexible/QuorumMaj.java (added)
+++ hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/flexible/QuorumMaj.java Fri Apr 24 18:26:08 2009
@@ -0,0 +1,46 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.zookeeper.server.quorum.flexible;
+
+import java.util.HashSet;
+
+
+/**
+ * This class implements a validator for majority quorums. The
+ * implementation is straightforward.
+ *
+ */
+public class QuorumMaj implements QuorumVerifier {
+ int half;
+
+ /**
+ * Defines a majority to avoid computing it every time.
+ */
+ public QuorumMaj(int n){
+ this.half = n/2;
+ }
+
+ /**
+ * Verifies if a set is a majority.
+ */
+ public boolean containsQuorum(HashSet<Long> set){
+ return (set.size() > half);
+ }
+
+}
\ No newline at end of file
Added: hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/flexible/QuorumVerifier.java
URL: http://svn.apache.org/viewvc/hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/flexible/QuorumVerifier.java?rev=768390&view=auto
==============================================================================
--- hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/flexible/QuorumVerifier.java (added)
+++ hadoop/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/flexible/QuorumVerifier.java Fri Apr 24 18:26:08 2009
@@ -0,0 +1,32 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.zookeeper.server.quorum.flexible;
+
+import java.util.HashSet;
+
+/**
+ * All quorum validators have to implement a method called
+ * containsQuorum, which verifies if a HashSet of server
+ * identifiers constitutes a quorum.
+ *
+ */
+
+public interface QuorumVerifier {
+ boolean containsQuorum(HashSet<Long> set);
+}
\ No newline at end of file
Added: hadoop/zookeeper/trunk/src/java/test/org/apache/zookeeper/test/HierarchicalQuorumTest.java
URL: http://svn.apache.org/viewvc/hadoop/zookeeper/trunk/src/java/test/org/apache/zookeeper/test/HierarchicalQuorumTest.java?rev=768390&view=auto
==============================================================================
--- hadoop/zookeeper/trunk/src/java/test/org/apache/zookeeper/test/HierarchicalQuorumTest.java (added)
+++ hadoop/zookeeper/trunk/src/java/test/org/apache/zookeeper/test/HierarchicalQuorumTest.java Fri Apr 24 18:26:08 2009
@@ -0,0 +1,188 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.zookeeper.test;
+import java.io.IOException;
+import java.util.Properties;
+
+import java.io.File;
+import java.io.ByteArrayInputStream;
+import java.net.InetSocketAddress;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Random;
+
+import org.apache.zookeeper.KeeperException;
+import org.apache.zookeeper.server.quorum.FastLeaderElection;
+import org.apache.zookeeper.server.quorum.QuorumPeer;
+import org.apache.zookeeper.server.quorum.Vote;
+import org.apache.zookeeper.server.quorum.QuorumPeer.QuorumServer;
+import org.apache.zookeeper.server.quorum.QuorumPeer.ServerState;
+import org.apache.zookeeper.server.quorum.flexible.QuorumHierarchical;
+
+import org.apache.log4j.Logger;
+import junit.framework.TestCase;
+import org.junit.Before;
+import org.junit.Test;
+
+public class HierarchicalQuorumTest extends TestCase {
+ private static final Logger LOG = Logger.getLogger(HierarchicalQuorumTest.class);
+
+ class TestVote{
+ TestVote(int id, long leader){
+ this.leader = leader;
+ this.id = id;
+ }
+ long leader;
+ int id;
+ }
+
+ Properties qp;
+
+ int count;
+ int baseport;
+ int baseLEport;
+ HashMap<Long,QuorumServer> peers;
+ ArrayList<LEThread> threads;
+ File tmpdir[];
+ int port[];
+ Object finalObj;
+
+ volatile Vote votes[];
+ volatile boolean leaderDies;
+ volatile long leader = -1;
+ Random rand = new Random();
+
+
+ @Before
+ @Override
+ protected void setUp() throws Exception {
+ count = 9;
+ baseport= 33003;
+ baseLEport = 43003;
+
+ peers = new HashMap<Long,QuorumServer>(count);
+ threads = new ArrayList<LEThread>(count);
+ votes = new Vote[count];
+ tmpdir = new File[count];
+ port = new int[count];
+ finalObj = new Object();
+
+ String config = "group.1=0:1:2\n" +
+ "group.2=3:4:5\n" +
+ "group.3=6:7:8\n\n" +
+ "weight.0=1\n" +
+ "weight.1=1\n" +
+ "weight.2=1\n" +
+ "weight.3=1\n" +
+ "weight.4=1\n" +
+ "weight.5=1\n" +
+ "weight.6=1\n" +
+ "weight.7=1\n" +
+ "weight.8=1";
+
+ ByteArrayInputStream is = new ByteArrayInputStream(config.getBytes());
+ this.qp = new Properties();
+ qp.load(is);
+
+ LOG.info("SetUp " + getName());
+ }
+
+ protected void tearDown() throws Exception {
+ for(int i = 0; i < threads.size(); i++) {
+ ((FastLeaderElection) threads.get(i).peer.getElectionAlg()).shutdown();
+ }
+ LOG.info("FINISHED " + getName());
+ }
+
+ class LEThread extends Thread {
+ FastLeaderElection le;
+ int i;
+ QuorumPeer peer;
+ //int peerRound = 1;
+
+ LEThread(QuorumPeer peer, int i) {
+ this.i = i;
+ this.peer = peer;
+ LOG.info("Constructor: " + getName());
+ }
+
+ public void run() {
+ try {
+ Vote v = null;
+ while(true){
+
+ //while(true) {
+ peer.setPeerState(ServerState.LOOKING);
+ LOG.info("Going to call leader election.");
+ v = peer.getElectionAlg().lookForLeader();
+ if(v == null){
+ LOG.info("Thread " + i + " got a null vote");
+ return;
+ }
+
+ /*
+ * A real zookeeper would take care of setting the current vote. Here
+ * we do it manually.
+ */
+ peer.setCurrentVote(v);
+
+ LOG.info("Finished election: " + i + ", " + v.id);
+ votes[i] = v;
+
+ if((peer.getPeerState() == ServerState.FOLLOWING) ||
+ (peer.getPeerState() == ServerState.LEADING)) break;
+ }
+ LOG.debug("Thread " + i + " votes " + v);
+ } catch (InterruptedException e) {
+ e.printStackTrace();
+ }
+ }
+ }
+
+ @Test
+ public void testHierarchicalQuorum() throws Exception {
+ FastLeaderElection le[] = new FastLeaderElection[count];
+
+ LOG.info("TestHierarchicalQuorum: " + getName()+ ", " + count);
+ for(int i = 0; i < count; i++) {
+ peers.put(Long.valueOf(i), new QuorumServer(i, new InetSocketAddress(baseport+100+i),
+ new InetSocketAddress(baseLEport+100+i)));
+ tmpdir[i] = File.createTempFile("letest", "test");
+ port[i] = baseport+i;
+ }
+
+ for(int i = 0; i < le.length; i++) {
+ QuorumHierarchical hq = new QuorumHierarchical(qp);
+ QuorumPeer peer = new QuorumPeer(peers, tmpdir[i], tmpdir[i], port[i], 3, i, 2, 2, 2, hq);
+ peer.startLeaderElection();
+ LEThread thread = new LEThread(peer, i);
+ thread.start();
+ threads.add(thread);
+ }
+ LOG.info("Started threads " + getName());
+
+ for(int i = 0; i < threads.size(); i++) {
+ threads.get(i).join(15000);
+ if (threads.get(i).isAlive()) {
+ fail("Threads didn't join");
+ }
+ }
+ }
+}
\ No newline at end of file
|