<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
	xmlns:content="http://purl.org/rss/1.0/modules/content/"
	xmlns:wfw="http://wellformedweb.org/CommentAPI/"
	xmlns:dc="http://purl.org/dc/elements/1.1/"
	xmlns:atom="http://www.w3.org/2005/Atom"
	xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
	xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
	xmlns:georss="http://www.georss.org/georss" xmlns:geo="http://www.w3.org/2003/01/geo/wgs84_pos#" xmlns:media="http://search.yahoo.com/mrss/"
	>

<channel>
	<title>Daisy CMS &#38; Kauri</title>
	<atom:link href="http://brunodumon.wordpress.com/feed/" rel="self" type="application/rss+xml" />
	<link>http://brunodumon.wordpress.com</link>
	<description>Bruno's blog about the Daisy CMS and the Kauri webapp framework</description>
	<lastBuildDate>Wed, 23 Feb 2011 08:34:24 +0000</lastBuildDate>
	<language>en</language>
	<sy:updatePeriod>hourly</sy:updatePeriod>
	<sy:updateFrequency>1</sy:updateFrequency>
	<generator>http://wordpress.com/</generator>
<cloud domain='brunodumon.wordpress.com' port='80' path='/?rsscloud=notify' registerProcedure='' protocol='http-post' />
<image>
		<url>http://s2.wp.com/i/buttonw-com.png</url>
		<title>Daisy CMS &#38; Kauri</title>
		<link>http://brunodumon.wordpress.com</link>
	</image>
	<atom:link rel="search" type="application/opensearchdescription+xml" href="http://brunodumon.wordpress.com/osd.xml" title="Daisy CMS &#38; Kauri" />
	<atom:link rel='hub' href='http://brunodumon.wordpress.com/?pushpress=hub'/>
		<item>
		<title>HBase row locks</title>
		<link>http://brunodumon.wordpress.com/2010/05/04/hbase-row-locks/</link>
		<comments>http://brunodumon.wordpress.com/2010/05/04/hbase-row-locks/#comments</comments>
		<pubDate>Tue, 04 May 2010 15:16:39 +0000</pubDate>
		<dc:creator>Bruno Dumon</dc:creator>
				<category><![CDATA[Uncategorized]]></category>

		<guid isPermaLink="false">http://brunodumon.wordpress.com/?p=156</guid>
		<description><![CDATA[I made a new blog post on the Outerthought blog about HBase row locks. All my future posts will be appearing over there, so be sure to check it out and update your feed subscriptions: http://outerthought.org/blog/<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=brunodumon.wordpress.com&amp;blog=1315769&amp;post=156&amp;subd=brunodumon&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>I made a new blog post on the <a href="http://outerthought.org/blog/">Outerthought blog</a> about <a href="http://outerthought.org/blog/380-OTC.html">HBase row locks</a>.</p>
<p>All my future posts will be appearing over there, so be sure to check it out and update your feed subscriptions:</p>
<p><a href="http://outerthought.org/blog/">http://outerthought.org/blog/</a></p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/brunodumon.wordpress.com/156/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/brunodumon.wordpress.com/156/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/brunodumon.wordpress.com/156/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/brunodumon.wordpress.com/156/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/brunodumon.wordpress.com/156/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/brunodumon.wordpress.com/156/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/brunodumon.wordpress.com/156/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/brunodumon.wordpress.com/156/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/brunodumon.wordpress.com/156/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/brunodumon.wordpress.com/156/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/brunodumon.wordpress.com/156/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/brunodumon.wordpress.com/156/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/brunodumon.wordpress.com/156/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/brunodumon.wordpress.com/156/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=brunodumon.wordpress.com&amp;blog=1315769&amp;post=156&amp;subd=brunodumon&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://brunodumon.wordpress.com/2010/05/04/hbase-row-locks/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://0.gravatar.com/avatar/45f0d1b499d3bf722e619b11b3e8f589?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">brunodumon</media:title>
		</media:content>
	</item>
		<item>
		<title>HBase indexing library</title>
		<link>http://brunodumon.wordpress.com/2010/02/22/hbase-indexing-library/</link>
		<comments>http://brunodumon.wordpress.com/2010/02/22/hbase-indexing-library/#comments</comments>
		<pubDate>Mon, 22 Feb 2010 17:07:58 +0000</pubDate>
		<dc:creator>Bruno Dumon</dc:creator>
				<category><![CDATA[Uncategorized]]></category>

		<guid isPermaLink="false">http://brunodumon.wordpress.com/?p=142</guid>
		<description><![CDATA[I have implemented a first iteration of the HBase-based indexing approach described in my previous blog entry. You can find it on the lilycms.org site: hbase indexing library.<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=brunodumon.wordpress.com&amp;blog=1315769&amp;post=142&amp;subd=brunodumon&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>I have implemented a first iteration of the HBase-based indexing approach described in <a href="http://brunodumon.wordpress.com/2010/02/17/building-indexes-using-hbase-mapping-strings-numbers-and-dates-onto-bytes/">my previous blog entry</a>.</p>
<p>You can find it on the lilycms.org site: <a href="http://lilycms.org/lily/index/362-OTC/361-OTC.html">hbase indexing library</a>.</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/brunodumon.wordpress.com/142/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/brunodumon.wordpress.com/142/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/brunodumon.wordpress.com/142/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/brunodumon.wordpress.com/142/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/brunodumon.wordpress.com/142/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/brunodumon.wordpress.com/142/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/brunodumon.wordpress.com/142/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/brunodumon.wordpress.com/142/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/brunodumon.wordpress.com/142/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/brunodumon.wordpress.com/142/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/brunodumon.wordpress.com/142/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/brunodumon.wordpress.com/142/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/brunodumon.wordpress.com/142/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/brunodumon.wordpress.com/142/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=brunodumon.wordpress.com&amp;blog=1315769&amp;post=142&amp;subd=brunodumon&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://brunodumon.wordpress.com/2010/02/22/hbase-indexing-library/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://0.gravatar.com/avatar/45f0d1b499d3bf722e619b11b3e8f589?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">brunodumon</media:title>
		</media:content>
	</item>
		<item>
		<title>Building indexes using HBase: mapping strings, numbers and dates onto bytes</title>
		<link>http://brunodumon.wordpress.com/2010/02/17/building-indexes-using-hbase-mapping-strings-numbers-and-dates-onto-bytes/</link>
		<comments>http://brunodumon.wordpress.com/2010/02/17/building-indexes-using-hbase-mapping-strings-numbers-and-dates-onto-bytes/#comments</comments>
		<pubDate>Wed, 17 Feb 2010 09:03:34 +0000</pubDate>
		<dc:creator>Bruno Dumon</dc:creator>
				<category><![CDATA[Uncategorized]]></category>

		<guid isPermaLink="false">http://brunodumon.wordpress.com/?p=111</guid>
		<description><![CDATA[Note: a library implementing the ideas below is now available: hbase indexing library. I am looking into exploiting the sorted nature of HBase tables to build indexes. See this interesting presentation by Ryan Barrett (or these slides by Joe Gregorio) for how Google does the same for App Engine&#8217;s datastore on Bigtable. HBase identifies rows [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=brunodumon.wordpress.com&amp;blog=1315769&amp;post=111&amp;subd=brunodumon&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p style="padding-left:30px;">Note: a library implementing the ideas below is now available: <a href="http://lilycms.org/lily/roadmap/sketchbook/hbaseindexes.html">hbase indexing library</a>.</p>
<p>I am looking into exploiting the sorted nature of HBase tables to build indexes. See this interesting <a href="http://sites.google.com/site/io/under-the-covers-of-the-google-app-engine-datastore">presentation by Ryan Barrett</a> (or <a href="http://www.chariotsolutions.com/slides/pdfs/ete2009-GoogleUndertheCoversApp.pdf">these slides</a> by Joe Gregorio) for how Google does the same for App Engine&#8217;s datastore on Bigtable.</p>
<p>HBase identifies rows by a key, the key is a byte array, and it keeps the rows sorted on these byte[] keys. If we want to index more high-level data types like strings, number or dates, we will have to figure out how to map these onto bytes so that sort order is maintained as desired. This post is about exactly this problem, but first I will quickly go into the basics of building an index with HBase.</p>
<h2>Building an index with HBase</h2>
<p>HBase keeps rows sorted lexicographically by row key, and allows range scans (from-to) over these rows. Lexicographical sorting means sorting like in a dictionary, where the corresponding characters of two words (here bytes in byte arrays) are compared from left to right.</p>
<p>Suppose we have stored entities in HBase, one HBase-row per entity, and each of these entities has a property Country. Now we want to build a secondary index for these entities, in a different HBase table, to allow querying these entities by their Country property. The row keys in this index table would be composed of the Country property and the key of the row that uses this property value. In the example below, the target row key is shown as a number, but it can be any byte array.</p>
<table>
<tbody>
<tr>
<th>Country</th>
<th>Row</th>
</tr>
<tr>
<td>Belgium</td>
<td>1</td>
</tr>
<tr>
<td>Belgium</td>
<td>5</td>
</tr>
<tr>
<td>Brazil</td>
<td>7</td>
</tr>
<tr>
<td>France</td>
<td>4</td>
</tr>
<tr>
<td>France</td>
<td>12</td>
</tr>
</tbody>
</table>
<p>If we want to find all entities whose Country is France, we can use an HBase scanner to find all rows starting with France. Technically, you would set the start row for the scanner to France and stop the scanning by using a RowFilter with a BinaryPrefixComparator on the end value, here again France.</p>
<p>We are not limited to equals searches. We can also do range searches (e.g. from &#8216;Belgium&#8217; to &#8216;France&#8217;) or prefix searches (e.g. all entities whose country name starts with a B).</p>
<p>Note that we have no use for HBase columns here: all information we need is stored in the row key. The indexed row has to be part of the row key, otherwise we could have duplicate row keys which is not possible. Since HBase actually requires at least one column, we need to store a dummy column.</p>
<p>The same mechanism also allows building composite indexes, where the row key is built up of multiple properties. Let&#8217;s extend our previous index with a new property Category.</p>
<table>
<tbody>
<tr>
<th>Country</th>
<th>Category</th>
<th>Row</th>
</tr>
<tr>
<td>Belgium</td>
<td>C</td>
<td>1</td>
</tr>
<tr>
<td>France</td>
<td>A</td>
<td>4</td>
</tr>
<tr>
<td>France</td>
<td>A</td>
<td>12</td>
</tr>
<tr>
<td>France</td>
<td>B</td>
<td>4</td>
</tr>
</tbody>
</table>
<p>The Category property is multi-valued, as exemplified by the entity 4 which occurs two times in the index.</p>
<p>We can again use HBase scanners to search on this index. With composite indexes, you do not necessarily need to search on each field, but you have to use the index-fields from left to right, and only the rightmost field can be used for range or prefix searching. In our example, you could equal-search on just country, equal-search on the combination of country and category, range/prefix scan on country, or equal-search on country in combination with a range/prefix scan on category.</p>
<p>When you do not search on all fields of a composite index, or when you do a range search and the field is multi-valued (like in the Category example), then the index can return the same row multiple times. In the example above, searching for France without a condition on Category would return row 4 two times, and not grouped together. This can be annoying for the consumer of the index results.</p>
<h3>Merge joins</h3>
<p>If we use indexes such that their returned row keys are unique (by searching on all fields of a composite index, or by not using range-scans for multi-valued properties) and sorted by row key (this last point is automatically assured by HBase), then we can easily merge-join results from multiple indexes! Merge-joins are nicely explained in <a href="http://www.youtube.com/watch?v=AgaL6NGpkB8#t=23m">this presentation</a> by Brett Slatkin.</p>
<h2>From the logical index model to bytes</h2>
<p>As mentioned before, HBase row keys are plain byte arrays, and HBase determines the row order by byte comparison. This means that to compare two byte arrays, HBase compares the corresponding bytes, from left to right. All corresponding bytes being equal, shorter arrays compare as being smaller than longer arrays.</p>
<p><em>If we want to build indexes on data types such as strings, numbers or dates we will have to map them onto bytes in such a way that when HBase performs byte comparison, the order is maintained the same as when we would have compared the logical data types.</em></p>
<p>For composite indexes, we will need to pad the key entries with zeros so that corresponding values align in the byte arrays. So if the country field could be 10 bytes wide and the category field 3 bytes, index entries could look like:</p>
<pre>France0000A002
Belgium000C001</pre>
<p>The zeros represent bytes with all bits set to zero, so that in comparisons they will always be smaller than anything else.</p>
<p>Note that for strings, this means that you&#8217;ll have to decide beforehand how long the indexed string value can get.</p>
<p><strong>Update:</strong> as Chris points out in the comments, this padding is not necessary, a well-chosen separator will also do the trick. After all, we only need to compare corresponding values if all the values more to the left are already equal.</p>
<h2>String sorting</h2>
<p>When we want to keep an index of string values, we need to convert those strings first to bytes to be able to store them in HBase row keys. If this conversion is done using UTF-8, then according to <a href="http://en.wikipedia.org/wiki/UTF8">Wikipedia</a>, &#8220;Sorting of UTF-8 strings as arrays of unsigned bytes will produce the same results as sorting them based on Unicode code points&#8221;.</p>
<p>Often we will not want our strings to be sorted by Unicode code points, since we want to have, for example, é or E sorted before f. Note that this ordering is important because we want to do range scans; for equals searches alone it does not matter what comes first.</p>
<p>Another aspect besides the order of the index is that, when searching on an index, one will often (but not always) want to ignore certain spelling variations like missing accents on characters.</p>
<h3>Ignoring case</h3>
<p>Ignoring the casing can be solved by translating all strings to their lowercase variant. Lowercasing a string in Java is a locale-dependent operation, though there are only a few locales for which this really makes a difference: Turkish, Azerbaijani and Lithuanian.</p>
<h3>Normalizing</h3>
<p>Sometimes there are multiple possible Unicode representations for the same visual character. A typical example is that characters with accents can be represented either as a single Unicode character or as the combination of the base character and a combining accent character. Java&#8217;s <a href="http://java.sun.com/javase/6/docs/api/java/text/Normalizer.html">Normalizer class</a> can canonicalize these different forms.</p>
<h3>Simplify strings</h3>
<p>A possible solution for accented characters is to remove the accents, and more in general to reduce the text to plain ASCII. Lucene does this, we can re-use the code of their ASCIIFoldingFilter.</p>
<h3>Collator</h3>
<p>The standard Java solution for Locale-sensitive sorting is to use the <a href="http://java.sun.com/javase/6/docs/api/java/text/Collator.html">Collator class</a>. As far as sorting is concerned, it does exactly what we want. And it is even possible to materialize this ordering to something we can use as HBase row key: via the Collator you can get access to a <a href="http://java.sun.com/javase/6/docs/api/java/text/CollationKey.html">CollationKey</a>, which &#8220;converts a String to a series of bits that can be compared bitwise against other CollationKeys&#8221;. And you can get access to these bits using CollationKey&#8217;s toByteArray() method.</p>
<p>This sounds like an ideal solution, though there are some things to be aware of:</p>
<ul>
<li>The collation byte arrays are rather long: it seems to use 6 to 8 bytes per character, plus some global overhead.</li>
<li>The inverse translation, from collation bytes to string, is not supported. This is not really needed for our indexing purpose, but might be handy for debugging indexes.</li>
<li>The algorithm for the construction of the collation key bits is not specified as part of the API, so it might differ between JVM implementations or JVM versions.</li>
<li>While the Collator offers optimal sorting, it does not help if you want to search ignoring accents. But the reverse is true too: if you want to perform exact case-sensitive searches, while also having locale-sensitive and case-insensitive sorting, then the collator solution is perfect.</li>
<li>The collation key of a shorter string is not a prefix of the collation key of a longer string, so if you want to search on a prefix of the string, this is not possible. I find this an important disadvantage.</li>
</ul>
<h3>String sorting conclusion</h3>
<p>There is no obvious choice for a default fits-all solution, so for my HBase-indexing purpose I am looking into making the string to byte conversion pluggable.</p>
<h2>Integer sorting</h2>
<p>If we compare the byte representation of two integers, will this behave such that the smaller integer is considered smaller than the larger integer?</p>
<p>To know the answer to this question, we need to know that the binary representation of an integer is two&#8217;s complement.</p>
<p>Ignoring the two&#8217;s complement for a moment, in plain binary numbers, the more significant bits are more to the left, so HBase&#8217;s left-to-right comparison will automatically do the right thing.</p>
<p>The first bit of a two&#8217;s complement integer is a sign bit: 1 for negative numbers, 0 for positive numbers. 1 is larger than 0, so with byte-based comparison this would mean negative numbers are considered larger than positive numbers. This can be easily solved by flipping that bit.</p>
<p>In the negative number range, -1000 is smaller than -10, so a bigger magnitude means a smaller value. However, because in two&#8217;s complement the bits of all negative numbers are inverted, the order will be fine.</p>
<p>So in conclusion, to make integers compare correctly, we only need to flip the sign bit.</p>
<h2>Float sorting</h2>
<p>Floats are a more interesting problem than integers. For one, floating point numbers are an approximate representation, so equals searches will be a problem. So let us assume that these indexes will be only used for range searches, possibly with a small epsilon range. In composite indexes, they will hence only be usable as the last field.</p>
<p>Java uses the float representation as defined by IEEE:</p>
<pre>[1 sign bit][8 exponent bits][23 mantissa bits]</pre>
<p>The sign bit is again 1 for negative numbers and 0 for positive numbers. The exponent and mantissa are such that the most significant bits are to the left. The exponent is unsigned. In the mantissa each place represents a negative power of 2: 2<sup>-1</sup>, 2<sup>-2</sup>, 2<sup>-3</sup>, &#8230; similar to the decimal system. See <a href="http://docs.sun.com/source/806-3568/ncg_math.html">this document</a> for more details on the float format.</p>
<p>With this representation positive floats will compare correctly. We only have to flip the sign bit so that positive numbers will be larger than negative numbers.</p>
<p>For negative numbers, in contrast with the two&#8217;s complement integers, the bits are not inverted. Simply flipping all the exponent and mantissa bits will get exactly the behavior we need.</p>
<p>Note that there is a lot more to say about floats: there is a way to encode positive and negative infinity, there is something called subnormal numbers, and there is a positive and negative zero. The float representation is designed such that all these will be sorted fine without further consideration. A special case is NaN, not a number, for which there are multiple representations possible, and these will be sorted either before negative infinity or after positive infinity.</p>
<h2>Decimal sorting</h2>
<p>To sort BigDecimals, or also for floats, we can use the decimal string representation of the numbers. A simple example is:</p>
<pre>055.23
124.359</pre>
<p>Note that you have to pad your numbers with leading zeros in order to have them ordered correctly.</p>
<p>There are some difficulties with this approach though.</p>
<p>First, this will not work as-is for negative numbers: -1 is larger than -2. This can be solved by changing 1 to 9, 2 to 8, and so on. Then there is still the problem that -3.33 is larger than -3.333, but lexicographically the longer string will be sorted after the shorter one. This can be solved by suffixing each negative number with something that is larger than any digit, like the character &#8216;a&#8217;.</p>
<p>Second, you need to know beforehand how large your numbers can get. And if they can get really large, you will end up with large strings. However, inspired by studying the floats encoding above, I think we can do a decimal equivalent of the floating point&#8217;s exponent-mantissa approach. With a bit of Googling, I found <a href="http://markmail.org/message/dnyfg6mmg7q4tkh4">this post</a> by Steven A Rowe which is about the same idea.</p>
<p><strong>Update:</strong></p>
<p>There is actually no need for a string-like approach for decimals, since HBase does not use strings anyway. I have probably been looking too much at Lucene lately.</p>
<p>As the BigDecimal javadoc says: &#8220;A BigDecimal consists of an arbitrary precision integer unscaled value and a 32-bit integer scale&#8221;. We can simply use this unscaled value as mantissa, it is available as (two&#8217;s complement) bytes via BigDecimal.unscaledValue().toByteArray(). The scale of a BigDecimal is the number of digits to the right of the decimal point. For the exponent, we are rather interested in the number of digits to the left, which can be computed via BigDecimal.precision() &#8211; BigDecimal.scale().</p>
<p>Given all this, we can build a byte array in the same way as for floats: a sign bit, some exponent bits, and a variable number of mantissa bits. As in IEEE floats, we can offset the exponent so that it becomes unsigned. For the rest, just invert bits with a similar reasoning as for floats.</p>
<h2>Date sorting</h2>
<p>The widely used <a href="http://en.wikipedia.org/wiki/ISO_8601">ISO 8601</a> date-time format is designed such that lexicographic order corresponds to the chronological order, so we can just use that format. It only uses plain ASCII characters, so the string-to-byte conversion does not pose a problem.</p>
<p>But again, there are some things to be aware of.</p>
<p>First, we should normalize all our date-times to the same timezone, preferably UTC, in which case the ISO 8601 string ends with a Z.</p>
<p>Second, the sorting will not be correct for negative years. This could be solved in a similar way as for decimals.</p>
<p><strong>Update:</strong></p>
<p>Dates can of course also be treated as integer/longs.</p>
<h2>Ascending and descending indexes</h2>
<p>By scanning over an index, the matching rows will be automatically returned in the order of the values we indexed on. HBase cannot scan in reverse order, so if you would like to be able to retrieve results in reverse order, you could do so by inverting all the bits in the index row keys.</p>
<h2>Status</h2>
<p>All the above is preliminary thought work, it might be full of errors and oversights. I hope to put it into practice sometime soon.</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/brunodumon.wordpress.com/111/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/brunodumon.wordpress.com/111/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/brunodumon.wordpress.com/111/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/brunodumon.wordpress.com/111/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/brunodumon.wordpress.com/111/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/brunodumon.wordpress.com/111/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/brunodumon.wordpress.com/111/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/brunodumon.wordpress.com/111/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/brunodumon.wordpress.com/111/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/brunodumon.wordpress.com/111/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/brunodumon.wordpress.com/111/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/brunodumon.wordpress.com/111/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/brunodumon.wordpress.com/111/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/brunodumon.wordpress.com/111/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=brunodumon.wordpress.com&amp;blog=1315769&amp;post=111&amp;subd=brunodumon&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://brunodumon.wordpress.com/2010/02/17/building-indexes-using-hbase-mapping-strings-numbers-and-dates-onto-bytes/feed/</wfw:commentRss>
		<slash:comments>14</slash:comments>
	
		<media:content url="http://0.gravatar.com/avatar/45f0d1b499d3bf722e619b11b3e8f589?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">brunodumon</media:title>
		</media:content>
	</item>
		<item>
		<title>The Quote Based Guide to Hadoop Avro</title>
		<link>http://brunodumon.wordpress.com/2010/02/01/the-quote-based-guide-to-hadoop-avro/</link>
		<comments>http://brunodumon.wordpress.com/2010/02/01/the-quote-based-guide-to-hadoop-avro/#comments</comments>
		<pubDate>Mon, 01 Feb 2010 19:14:37 +0000</pubDate>
		<dc:creator>Bruno Dumon</dc:creator>
				<category><![CDATA[Uncategorized]]></category>

		<guid isPermaLink="false">http://brunodumon.wordpress.com/?p=92</guid>
		<description><![CDATA[I have been taking a quick look at Avro. Avro is in the same category as Thrift, protocol buffers, or Etch. It provides efficient (de)serialization for a binary format and an RPC solution that uses this binary format. The formats as well as the RPC interfaces (&#8220;protocols&#8221;) are described using an IDL, allowing to support [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=brunodumon.wordpress.com&amp;blog=1315769&amp;post=92&amp;subd=brunodumon&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>I have been taking a quick look at <a href="http://hadoop.apache.org/avro/">Avro</a>. Avro is in the same category as <a href="http://incubator.apache.org/thrift/">Thrift</a>, <a href="http://code.google.com/p/protobuf/">protocol buffers</a>, or <a href="http://cwiki.apache.org/ETCH/home.html">Etch</a>. It provides efficient (de)serialization for a binary format and an RPC solution that uses this binary format. The formats as well as the RPC interfaces (&#8220;protocols&#8221;) are described using an IDL, allowing to support different programming languages.</p>
<p><a href="http://brunodumon.files.wordpress.com/2010/02/avro-logo.png"><img class="aligncenter size-full wp-image-105" title="avro-logo" src="http://brunodumon.files.wordpress.com/2010/02/avro-logo.png?w=480" alt=""   /></a></p>
<p>The Avro website provides useful info, but I stumbled across the quotes below which I found helpful in better understanding what it is all about.</p>
<h3>Representations</h3>
<p><a href="http://markmail.org/message/apvw4g327snhoees">Doug Cutting</a>:</p>
<blockquote><p><em>Avro&#8217;s Java implementation currently includes three different data representations:</em></p>
<ul>
<li><em>a &#8220;generic&#8221; representation uses a standard set of datastructures for all datatypes: records are represented as Map&lt;String,Object&gt;, arrays as List&lt;Object&gt;, longs as Long, etc.</em></li>
<li><em>a &#8220;reflect&#8221; representation uses Java reflection to permit one to read and write existing Java classes with Avro.</em></li>
<li><em>a &#8220;specific&#8221; representation generates Java classes that are compiled and loaded, much like Thrift and Protocol Buffers.</em></li>
</ul>
<p><em>We don&#8217;t expect most scripting languages to use more than a single representation. Implementing Avro is quite simple, by design. We have a Python implementation, and hope to add more soon.</em></p></blockquote>
<p>As also mentioned in that thread, the reflect representation is less interesting from a performance point of view.</p>
<p>And as <a href="http://mail-archives.apache.org/mod_mbox/hadoop-avro-dev/200910.mbox/%3c4AE1F2E4.9030403@apache.org%3e">noted here</a>, <em>In Java, the Encoder/Decoder API also now double as an Event-based API</em>.</p>
<h3>The binary format</h3>
<p><a href="http://markmail.org/message/wj2esapipszic3wk">Doug Cutting</a>:</p>
<blockquote><p><em>A record&#8217;s fields are serialized in the order that the fields occur in the records schema, with no per-field annotations whatsoever. For example, a record that contains a string and an int is serialized simply as a string followed by an int, nothing before, nothing between and nothing after.</em></p></blockquote>
<h3>Standardizing a format rather than an API</h3>
<p><a href="http://markmail.org/message/wj2esapipszic3wk">Doug Cutting</a>:</p>
<blockquote><p><em>Thrift fundamentally standardizes an API, not a data format. Avro fundamentally is a data format specification, like XML.</em></p></blockquote>
<h3>Schema evolution &amp; skipping data while reading</h3>
<p><a href="http://markmail.org/message/5eh5k6unq4ms7wgw">Doug Cutting</a>:</p>
<blockquote><p><em>Avro supports schema evolution. In Avro, the schema used to write the data must be available when the data is read. (In files, it is typically stored in the file metadata.)</em></p>
<p><em>If you have the schema that was used to write the data, and you&#8217;re expecting a slightly different schema, then you simply keep those fields that are in both schemas and skip those not. This is equivalent to Thrift and Protocol Buffer&#8217;s support for schema evolution, but does not require manually assigning numeric field ids.</em></p>
<p><em>This feature can also be used to support projection. If you have records with many large fields, but only need a single field in a particular computation, then you can specify an expected schema with only that field, and the runtime will efficiently skip all of the other fields, returning a record with just the single, expected field.</em></p></blockquote>
<h3>The file format</h3>
<p>From the javadoc of the DataFileWriter class:</p>
<blockquote><p><em>[DataFileWriter] Stores in a file a sequence of data conforming to a schema. The schema is stored in the file with the data. Each datum in a file is of the same schema. Data is written with a DatumWriter. Data is grouped into <em>blocks</em>. A synchronization marker is written between blocks, so that files may be split. Blocks may be compressed. Extensible metadata is stored at the end of the file. Files may be appended to.</em></p></blockquote>
<p>If you know a bit of Hadoop MapReduce, you will understand why these block-markers are interesting.</p>
<p>Besides the DataFileWriter, you can also use the DatumWriter/DatumReader classes for serializing just one record without the overhead of storing the schema.</p>
<h3>RPC socket server status</h3>
<p><a href="http://mail-archives.apache.org/mod_mbox/hadoop-avro-dev/200910.mbox/%3C4AE0981E.8090805@apache.org%3E">Doug Cutting</a>:</p>
<blockquote><p><em>Brian McCallister wrote:<br />
&gt; Is the socket server in avro intended as an example/dev option or a<br />
&gt; productiony server?</em></p>
<p><em>It&#8217;s more of an example.  The wire protocol is probably fine for production, but the existing client and server are pretty dumb.  There&#8217;s no connection pooling, async server-side i/o etc.</em></p>
<p><em>The Java HTTP-based version performs much better than the socket version in my micro-benchmarks.  It gets connection pooling for free, and it&#8217;s easy to configure it for an async server that can support more connections than threads. Today that&#8217;s the most production-worthy implementation.</em></p></blockquote>
<p>They are thinking about a standard <a href="http://issues.apache.org/jira/browse/AVRO-341">transport for Avro</a> though, and there is <a href="http://issues.apache.org/jira/browse/HADOOP-6419">some talk</a> about using <a href="http://www.jboss.org/netty">Netty</a>.</p>
<h3>Utf8 strings</h3>
<p>Something that you will notice soon enough when using the API or when using the SchemaCompiler to generate classes is that Avro does not use the String class to represent strings, but rather its own Utf8 class. I suppose this is for performance reasons: this class just stores the UTF-8 bytes without converting them to a String. It is also mutable, allowing the same instance to be re-used.</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/brunodumon.wordpress.com/92/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/brunodumon.wordpress.com/92/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/brunodumon.wordpress.com/92/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/brunodumon.wordpress.com/92/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/brunodumon.wordpress.com/92/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/brunodumon.wordpress.com/92/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/brunodumon.wordpress.com/92/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/brunodumon.wordpress.com/92/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/brunodumon.wordpress.com/92/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/brunodumon.wordpress.com/92/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/brunodumon.wordpress.com/92/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/brunodumon.wordpress.com/92/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/brunodumon.wordpress.com/92/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/brunodumon.wordpress.com/92/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=brunodumon.wordpress.com&amp;blog=1315769&amp;post=92&amp;subd=brunodumon&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://brunodumon.wordpress.com/2010/02/01/the-quote-based-guide-to-hadoop-avro/feed/</wfw:commentRss>
		<slash:comments>1</slash:comments>
	
		<media:content url="http://0.gravatar.com/avatar/45f0d1b499d3bf722e619b11b3e8f589?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">brunodumon</media:title>
		</media:content>

		<media:content url="http://brunodumon.files.wordpress.com/2010/02/avro-logo.png" medium="image">
			<media:title type="html">avro-logo</media:title>
		</media:content>
	</item>
		<item>
		<title>Fosdem 2010</title>
		<link>http://brunodumon.wordpress.com/2010/01/31/fosdem-2010/</link>
		<comments>http://brunodumon.wordpress.com/2010/01/31/fosdem-2010/#comments</comments>
		<pubDate>Sun, 31 Jan 2010 17:18:12 +0000</pubDate>
		<dc:creator>Bruno Dumon</dc:creator>
				<category><![CDATA[Uncategorized]]></category>

		<guid isPermaLink="false">http://brunodumon.wordpress.com/?p=89</guid>
		<description><![CDATA[Next weekend (6-7 February) it is again Fosdem, the conference for open-source developers in Brussels. This year my Outerthought colleague Steven is organizing a devroom on NoSQL. And my colleague Evert will give a talk on building a CMS using NoSQL technologies. If you are interested in what our future content management solutions will be [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=brunodumon.wordpress.com&amp;blog=1315769&amp;post=89&amp;subd=brunodumon&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p><a href="http://www.fosdem.org"><img src="http://www.fosdem.org/promo/going-to" alt="I'm going to FOSDEM, the Free and Open Source Software Developers' European Meeting" /></a></p>
<p>Next weekend (6-7 February) it is again <a href="http://fosdem.org/2010/">Fosdem</a>, the conference for open-source developers in <a href="http://maps.google.com/maps?ie=UTF8&amp;z=17&amp;ll=50.812375,4.380734&amp;spn=0.005369,0.011373&amp;om=1">Brussels</a>.</p>
<p>This year my <a href="http://www.outerthought.org/">Outerthought</a> colleague Steven is organizing a <a href="http://fosdem.org/2010/schedule/tracks/nosql">devroom on NoSQL</a>. And my colleague Evert will give a <a href="http://fosdem.org/2010/schedule/events/nosql_scalable_cms">talk</a> on building a CMS using NoSQL technologies. If you are interested in what our future content management solutions will be like, make sure to be there. Entrance is free, no registration is required.</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/brunodumon.wordpress.com/89/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/brunodumon.wordpress.com/89/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/brunodumon.wordpress.com/89/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/brunodumon.wordpress.com/89/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/brunodumon.wordpress.com/89/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/brunodumon.wordpress.com/89/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/brunodumon.wordpress.com/89/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/brunodumon.wordpress.com/89/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/brunodumon.wordpress.com/89/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/brunodumon.wordpress.com/89/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/brunodumon.wordpress.com/89/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/brunodumon.wordpress.com/89/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/brunodumon.wordpress.com/89/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/brunodumon.wordpress.com/89/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=brunodumon.wordpress.com&amp;blog=1315769&amp;post=89&amp;subd=brunodumon&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://brunodumon.wordpress.com/2010/01/31/fosdem-2010/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://0.gravatar.com/avatar/45f0d1b499d3bf722e619b11b3e8f589?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">brunodumon</media:title>
		</media:content>

		<media:content url="http://www.fosdem.org/promo/going-to" medium="image">
			<media:title type="html">I'm going to FOSDEM, the Free and Open Source Software Developers' European Meeting</media:title>
		</media:content>
	</item>
		<item>
		<title>Scalable databases and skewed access patterns</title>
		<link>http://brunodumon.wordpress.com/2009/10/21/scalable-databases-and-skewed-access-patterns/</link>
		<comments>http://brunodumon.wordpress.com/2009/10/21/scalable-databases-and-skewed-access-patterns/#comments</comments>
		<pubDate>Wed, 21 Oct 2009 13:14:05 +0000</pubDate>
		<dc:creator>Bruno Dumon</dc:creator>
				<category><![CDATA[Uncategorized]]></category>

		<guid isPermaLink="false">http://brunodumon.wordpress.com/?p=84</guid>
		<description><![CDATA[One thing I found remarkable with scalable databases like Dynamo and BigTable is that while they spread the data over many servers, there is just one server responsible for handling requests to a certain record (= key). In the case of HBase, there is only one server for handling a specific tablet (= set of [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=brunodumon.wordpress.com&amp;blog=1315769&amp;post=84&amp;subd=brunodumon&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>One thing I found remarkable with scalable databases like Dynamo and BigTable is that while they spread the data over many servers, there is just one server responsible for handling requests to a certain record (= key). In the case of HBase, there is only one server for handling a specific tablet (= set of records) at a time. In the case of Dynamo, requests are handled by one coordinator but sent to all replicas, waiting until a certain number responded (= the quorum thing). So for reads of one specific record, there is no load balancing over multiple servers (AFAIU).</p>
<p>Now the Dynamo paper notes that &#8220;even where there is a significant skew in the access distribution there are enough keys in the popular end of the distribution so that the load of handling popular keys can be spread across the nodes uniformly through partitioning.&#8221;</p>
<p>The BigTable paper writes about a relatively small table (~500 GB) in Google Earth that &#8220;must serve tens of thousands of queries per second per datacenter with low latency. As a result, this table is hosted across hundreds of tablet servers and contains in-memory column families.&#8221;</p>
<p>Apparently in real-world scenarios things turn out OK. Still, what would happen when you have a very small data set accessed by a very large number of users? Probably it would be a bit crazy to touch the database for every read, and this could be solved by caching in the application layer.</p>
<p>Applications where you have the same problem for writes seem much less common to me, and so a specific solution can be engineered for that.</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/brunodumon.wordpress.com/84/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/brunodumon.wordpress.com/84/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/brunodumon.wordpress.com/84/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/brunodumon.wordpress.com/84/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/brunodumon.wordpress.com/84/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/brunodumon.wordpress.com/84/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/brunodumon.wordpress.com/84/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/brunodumon.wordpress.com/84/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/brunodumon.wordpress.com/84/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/brunodumon.wordpress.com/84/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/brunodumon.wordpress.com/84/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/brunodumon.wordpress.com/84/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/brunodumon.wordpress.com/84/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/brunodumon.wordpress.com/84/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=brunodumon.wordpress.com&amp;blog=1315769&amp;post=84&amp;subd=brunodumon&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://brunodumon.wordpress.com/2009/10/21/scalable-databases-and-skewed-access-patterns/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://0.gravatar.com/avatar/45f0d1b499d3bf722e619b11b3e8f589?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">brunodumon</media:title>
		</media:content>
	</item>
		<item>
		<title>A thought on client-server versus P2P</title>
		<link>http://brunodumon.wordpress.com/2009/10/19/a-thought-on-client-server-versus-p2p/</link>
		<comments>http://brunodumon.wordpress.com/2009/10/19/a-thought-on-client-server-versus-p2p/#comments</comments>
		<pubDate>Mon, 19 Oct 2009 18:10:09 +0000</pubDate>
		<dc:creator>Bruno Dumon</dc:creator>
				<category><![CDATA[Uncategorized]]></category>

		<guid isPermaLink="false">http://brunodumon.wordpress.com/?p=78</guid>
		<description><![CDATA[Client-server and P2P (peer-to-peer) are two opposing architectures for distributed systems. However, in a client-server architecture, it is perfectly possible that the server is internally organized as a P2P system. This is the case for highly-available distributed storage systems like Dynamo and Cassandra.<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=brunodumon.wordpress.com&amp;blog=1315769&amp;post=78&amp;subd=brunodumon&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>Client-server and P2P (peer-to-peer) are two opposing architectures for distributed systems. However, in a client-server architecture, it is perfectly possible that the server is internally organized as a P2P system. This is the case for highly-available distributed storage systems like Dynamo and Cassandra.</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/brunodumon.wordpress.com/78/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/brunodumon.wordpress.com/78/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/brunodumon.wordpress.com/78/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/brunodumon.wordpress.com/78/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/brunodumon.wordpress.com/78/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/brunodumon.wordpress.com/78/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/brunodumon.wordpress.com/78/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/brunodumon.wordpress.com/78/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/brunodumon.wordpress.com/78/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/brunodumon.wordpress.com/78/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/brunodumon.wordpress.com/78/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/brunodumon.wordpress.com/78/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/brunodumon.wordpress.com/78/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/brunodumon.wordpress.com/78/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=brunodumon.wordpress.com&amp;blog=1315769&amp;post=78&amp;subd=brunodumon&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://brunodumon.wordpress.com/2009/10/19/a-thought-on-client-server-versus-p2p/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://0.gravatar.com/avatar/45f0d1b499d3bf722e619b11b3e8f589?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">brunodumon</media:title>
		</media:content>
	</item>
		<item>
		<title>A thought on the relation between RDBMSes and distributed stores</title>
		<link>http://brunodumon.wordpress.com/2009/10/19/a-thought-on-the-relation-between-rdmbses-and-distributed-stores/</link>
		<comments>http://brunodumon.wordpress.com/2009/10/19/a-thought-on-the-relation-between-rdmbses-and-distributed-stores/#comments</comments>
		<pubDate>Mon, 19 Oct 2009 18:06:49 +0000</pubDate>
		<dc:creator>Bruno Dumon</dc:creator>
				<category><![CDATA[Uncategorized]]></category>

		<guid isPermaLink="false">http://brunodumon.wordpress.com/?p=77</guid>
		<description><![CDATA[As it is often misunderstood, I think it might make sense to review the relation of RDBMSes and NoSQL-style distributed stores. Scaling the relational model across machine boundaries does not work very well, but in a distributed storage system scenario, the local storage can still be provided by an RDBMS. A highly available distributed store [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=brunodumon.wordpress.com&amp;blog=1315769&amp;post=77&amp;subd=brunodumon&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>As it is often misunderstood, I think it might make sense to review the relation of RDBMSes and NoSQL-style distributed stores.</p>
<p>Scaling the relational model across machine boundaries does not work very well, but in a distributed storage system scenario, the local storage can still be provided by an RDBMS.</p>
<p>A highly available distributed store uses a set of nodes with a set of techniques (replication, consistent hashing, vector clocks, hash trees, hinted handoff, &#8230;). But at the node-local level, the data still needs to be stored somehow.</p>
<p>Some (Dynamo depending on config, PNUTS) store the data in an RDBMS (MySQL), which then serves as a btree-based key-value store. In this case, they are simply used because they exist, are ready-to-use and known-to-work.</p>
<p>Others (Cassandra, HBase/BigTable) use a simple better-performing solution based on SSTables (or similar), log files and memtables.</p>
<p>Cassandra and HBase are also examples of how it still makes sense to provide node-local, non-scalable finer-grained structures, namely the columns within column families. Similarly, within a distributed store the relational features could still be exploited at the local level.</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/brunodumon.wordpress.com/77/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/brunodumon.wordpress.com/77/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/brunodumon.wordpress.com/77/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/brunodumon.wordpress.com/77/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/brunodumon.wordpress.com/77/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/brunodumon.wordpress.com/77/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/brunodumon.wordpress.com/77/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/brunodumon.wordpress.com/77/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/brunodumon.wordpress.com/77/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/brunodumon.wordpress.com/77/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/brunodumon.wordpress.com/77/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/brunodumon.wordpress.com/77/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/brunodumon.wordpress.com/77/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/brunodumon.wordpress.com/77/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=brunodumon.wordpress.com&amp;blog=1315769&amp;post=77&amp;subd=brunodumon&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://brunodumon.wordpress.com/2009/10/19/a-thought-on-the-relation-between-rdmbses-and-distributed-stores/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://0.gravatar.com/avatar/45f0d1b499d3bf722e619b11b3e8f589?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">brunodumon</media:title>
		</media:content>
	</item>
		<item>
		<title>Virtual nodes</title>
		<link>http://brunodumon.wordpress.com/2009/10/09/virtual-nodes/</link>
		<comments>http://brunodumon.wordpress.com/2009/10/09/virtual-nodes/#comments</comments>
		<pubDate>Fri, 09 Oct 2009 14:42:42 +0000</pubDate>
		<dc:creator>Bruno Dumon</dc:creator>
				<category><![CDATA[Uncategorized]]></category>

		<guid isPermaLink="false">http://brunodumon.wordpress.com/?p=70</guid>
		<description><![CDATA[A recurring pattern noticed when partitioning (sharding) data over multiple systems is that of creating many more partitions than there are actual systems. Let&#8217;s call the systems nodes and the partitions virtual nodes. This is something which comes back in: Consistent hashing: each node gets assigned many of the hash buckets Katta (distributed Lucene): each [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=brunodumon.wordpress.com&amp;blog=1315769&amp;post=70&amp;subd=brunodumon&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>A recurring pattern noticed when partitioning (sharding) data over multiple systems is that of creating many more partitions than there are actual systems.</p>
<p>Let&#8217;s call the systems <em>nodes</em> and the partitions <em>virtual nodes</em>.</p>
<p>This is something which comes back in:</p>
<ul>
<li> <a href="http://www.akamai.com/dl/technical_publications/ConsistenHashingandRandomTreesDistributedCachingprotocolsforrelievingHotSpotsontheworldwideweb.pdf">Consistent hashing</a>: each node gets assigned many of the hash buckets</li>
<li> <a href="http://katta.sourceforge.net/">Katta</a> (distributed Lucene): each search server can handle a variety of index shards</li>
<li> <a href="http://labs.google.com/papers/bigtable.html">Bigtable</a> or <a href="http://hadoop.apache.org/hbase/">HBase</a>: each region server can handle a variety of regions</li>
</ul>
<p>In these examples, the hash bucket, the index shard and the region are the virtual nodes.</p>
<p>The advantages include:</p>
<ul>
<li> being able to handle nodes with different capabilities (a heterogeneous set of nodes): more powerful nodes get assigned more of the virtual nodes</li>
<li>when a node goes down:
<ul>
<li> the virtual nodes that it was responsible for can be re-assigned to multiple other nodes</li>
<li> or from another point of view, the replicas of the virtual nodes managed by this node will be on a variety of other nodes, so the load of the killed node will be automatically spread over multiple other nodes</li>
</ul>
</li>
<li>when a new node is added, it can take over the responsibility for some of the virtual nodes, and this can be done one by one to allow for warm up</li>
</ul>
<p>Speaking of recurring patterns, there is also a clear similarity in the way Lucene handles index updates and the way HBase &amp; Bigtable handle updates:</p>
<ul>
<li>Lucene first buffers index updates in memory, and when certain limits are reached, flushes them to a new file on disk called an index segment. This is instead of trying to update the existing index in-place. Searching is done by searching over all index segments. When the number of segments gets large, some of them are merged together in the background.</li>
<li>HBase/Bigtable does something similar with MapFiles/SSTables: new data is occasionally flushed to a new immutable MapFile (basically a file of key-value pairs sorted by key with an additional index containing the positions in the file of a subset of the keys, to avoid a full sequential scan). When looking for something, all flushed MapFiles are consulted until the item is found. A background process merges the MapFiles when there are too many of them.</li>
</ul>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/brunodumon.wordpress.com/70/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/brunodumon.wordpress.com/70/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/brunodumon.wordpress.com/70/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/brunodumon.wordpress.com/70/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/brunodumon.wordpress.com/70/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/brunodumon.wordpress.com/70/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/brunodumon.wordpress.com/70/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/brunodumon.wordpress.com/70/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/brunodumon.wordpress.com/70/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/brunodumon.wordpress.com/70/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/brunodumon.wordpress.com/70/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/brunodumon.wordpress.com/70/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/brunodumon.wordpress.com/70/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/brunodumon.wordpress.com/70/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=brunodumon.wordpress.com&amp;blog=1315769&amp;post=70&amp;subd=brunodumon&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://brunodumon.wordpress.com/2009/10/09/virtual-nodes/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://0.gravatar.com/avatar/45f0d1b499d3bf722e619b11b3e8f589?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">brunodumon</media:title>
		</media:content>
	</item>
		<item>
		<title>Memory, disk &amp; network speed</title>
		<link>http://brunodumon.wordpress.com/2009/07/17/memory-disk-network-speed/</link>
		<comments>http://brunodumon.wordpress.com/2009/07/17/memory-disk-network-speed/#comments</comments>
		<pubDate>Fri, 17 Jul 2009 11:42:24 +0000</pubDate>
		<dc:creator>Bruno Dumon</dc:creator>
				<category><![CDATA[Uncategorized]]></category>

		<guid isPermaLink="false">http://brunodumon.wordpress.com/?p=66</guid>
		<description><![CDATA[This acmqueue article by Adam Jacobs contains some interesting thoughts: in modern systems, [...] random access to memory is typically slower than sequential access to disk. Note that random reads from disk are more than 150,000 times slower than sequential access; SSD improves on this ratio by less than one order of magnitude. [...] the [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=brunodumon.wordpress.com&amp;blog=1315769&amp;post=66&amp;subd=brunodumon&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p><a href="http://queue.acm.org/detail.cfm?id=1563874">This acmqueue article</a> by <span><span>Adam Jacobs</span></span> contains some interesting thoughts:</p>
<ul>
<li>in modern systems, [...] <strong>random access to memory is typically slower than sequential access to disk</strong>. Note that random reads from disk are more than 150,000 times slower than sequential access; SSD improves on this ratio by less than one order of magnitude.</li>
<li>[...] the highest-speed local network technologies have now surpassed most locally attached disk systems with respect to bandwidth, and network latency is naturally much lower than disk latency. As a result, <strong>the performance cost of storing and retrieving data on other nodes in a network is comparable to</strong> (and in the case of random access, potentially far less than) <strong>the cost of using disk</strong>. Once a large dataset has been distributed to multiple nodes in this way, however, a huge advantage can be obtained by distributing the <em>processing</em> as well—so long as the analysis is amenable to parallel processing.</li>
</ul>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/brunodumon.wordpress.com/66/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/brunodumon.wordpress.com/66/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/brunodumon.wordpress.com/66/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/brunodumon.wordpress.com/66/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/brunodumon.wordpress.com/66/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/brunodumon.wordpress.com/66/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/brunodumon.wordpress.com/66/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/brunodumon.wordpress.com/66/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/brunodumon.wordpress.com/66/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/brunodumon.wordpress.com/66/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/brunodumon.wordpress.com/66/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/brunodumon.wordpress.com/66/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/brunodumon.wordpress.com/66/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/brunodumon.wordpress.com/66/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=brunodumon.wordpress.com&amp;blog=1315769&amp;post=66&amp;subd=brunodumon&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://brunodumon.wordpress.com/2009/07/17/memory-disk-network-speed/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://0.gravatar.com/avatar/45f0d1b499d3bf722e619b11b3e8f589?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">brunodumon</media:title>
		</media:content>
	</item>
	</channel>
</rss>
