Recent Papers by Mihai Budiu

@TechReport{budiu-tr13,
   author =	{Mihai Budiu and Gordon Plotkin and Yuan Yu and Li Zhang},
   title =	{Unified Query Processing for JSON Documents and Indexes},
   institution =	{Microsoft Research},
   number =	{MSR-TR-2013-122},
   month =	{December},
   year =	{2013},
   abstract =	{We present JPath, a JSON database query language, and its syntax,
                 semantics, and implementation. We introduce an indexing data structure
                 for answering JPath queries, and provide a theory unifying query
                 execution on data and index trees using operations on matrices with
                 lattice-valued elements.},
   url =	{http://research.microsoft.com/apps/pubs/?id=205468}
}

@InProceedings{budiu-esop13,
   author =	{Mihai Budiu and Joel Galenson and Gordon Plotkin},
   title =	{The Compiler Forest},
   booktitle =	{European Symposium on Programming (ESOP)},
   volume =	{LNCS 7792},
   pages =	{20},
   address =	{Rome, Italy},
   month =	{March 16-24},
   year =	{2013},
   abstract =	{We address the problem of writing compilers targeting \emph{complex
                 execution environments}, such as computer clusters composed of machines
                 with multi-core CPUs. To that end we introduce \emph{partial
                 compilers}. These compilers can pass sub-programs to several child
                 (partial) compilers, combining the code generated by their children to
                 generate the final target code. \par We define a set of high-level
                 polymorphic operations manipulating both compilers and partial
                 compilers as first-class values. These mechanisms provide a software
                 architecture for modular compiler construction. This allows the
                 building of a forest of compilers, providing a structured treatment of
                 multistage compilers.},
   url =	{http://budiu.info/work/esop13.pdf},
   editor =	{Matthias Felleisen and Philippa Gardner},
   confweb =	{http://www.ccs.neu.edu/esop2013/}
}

@article{erlingsson-tocs12,
   author =	{\'{U}lfar Erlingsson and Marcus Peinado and Simon Peter and Mihai Budiu and Gloria Mainar-Ruiz},
   title =	{{Fay}: Extensible Distributed Tracing from Kernels to Clusters},
   journal =	{ACM Transactions on Computer Systems (TOCS)},
   volume =	{30},
   number =	{4},
   month =	{November},
   year =	{2012},
   abstract =	{Fay is a flexible platform for the efficient collection, processing,
                 and analysis of software execution traces. Fay provides dynamic tracing
                 through use of runtime instrumentation and distributed aggregation
                 within machines and across clusters. At the lowest level, Fay can be
                 safely extended with new tracing primitives, including even untrusted,
                 fully-optimized machine code, and Fay can be applied to running
                 user-mode or kernel-mode software without compromising system
                 stability. At the highest level, Fay provides a unified, declarative
                 means of specifying what events to trace, as well as the aggregation,
                 processing, and analysis of those events. \par We have implemented the
                 Fay tracing platform for Windows and integrated it with two powerful,
                 expressive systems for distributed programming. Our implementation is
                 easy to use, can be applied to unmodified production systems, and
                 provides primitives that allow the overhead of tracing to be greatly
                 reduced, compared to previous dynamic tracing platforms. To show the
                 generality of Fay tracing, we reimplement, in experiments, a range of
                 tracing strategies and several custom mechanisms from existing tracing
                 frameworks. \par Fay shows that modern techniques for high-level
                  querying and data-parallel processing of disaggregated data streams are
                 well suited to comprehensive monitoring of software execution in
                 distributed systems. Revisiting a lesson from the late 1960s [Deutsch
                 and Grant 1971], Fay also demonstrates the efficiency and extensibility
                  benefits of using safe, statically-verified machine code as the basis
                 for low-level execution tracing. Finally, Fay establishes that, by
                 automatically deriving optimized query plans and code for safe
                 extensions, the expressiveness and performance of high-level tracing
                 queries can equal or even surpass that of specialized monitoring tools.},
   note =	{An expanded version of the SOSP 2011 paper},
   url =	{http://budiu.info/work/fay-tocs12.pdf},
   doi =	{http://dx.doi.org/10.1145/2382553.2382555},
   confweb =	{http://tocs.acm.org}
}

@Misc{budiu-hpdc12,
   author =	{Mihai Budiu},
   title =	{Putting A ``Big-Data'' Platform to Good Use: Training Kinect},
   address =	{Delft, Netherlands},
   month =	{June 20},
   year =	{2012},
   note =	{Keynote at the 21st International Symposium on High-Performance
                 Parallel and Distributed Computing (HPDC)},
   confweb =	{http://www.hpdc.org/2012},
   howpublished =	{http://budiu.info/work/hpdc12.pdf}
}

@InProceedings{budiu-biglearn11,
   author =	{Mihai Budiu and Jamie Shotton and Derek G. Murray and Mark Finocchio},
   title =	{Parallelizing the Training of the Kinect Body Parts Labeling Algorithm},
   booktitle =	{Big Learning: Algorithms, Systems and Tools for Learning at Scale},
   address =	{Sierra Nevada, Spain},
   month =	{December 16-17},
   year =	{2011},
   abstract =	{We present the parallelized implementation of decision forest training
                 as used in Kinect to train the body parts classification system. We
                 describe the practical details of dealing with large training sets and
                 deep trees, and describe how to parallelize over multiple dimensions of
                 the problem.},
   url =	{http://budiu.info/work/budiu-biglearn11.pdf},
   confweb =	{http://biglearn.org/}
}

@InProceedings{erlingsson-sosp11,
   author =	{\'{U}lfar Erlingsson and Marcus Peinado and Simon Peter and Mihai Budiu},
   title =	{{Fay}: Extensible Distributed Tracing from Kernels to Clusters},
   booktitle =	{ACM Symposium on Operating Systems Principles (SOSP)},
   address =	{Cascais, Portugal},
   month =	{October 23-26},
   year =	{2011},
   abstract =	{Fay is a flexible platform for the efficient collection, processing,
                 and analysis of software execution traces. Fay provides dynamic tracing
                 through use of runtime instrumentation and distributed aggregation
                 within machines and across clusters. At the lowest level, Fay can be
                 safely extended with new tracing primitives, including even untrusted,
                 fully-optimized machine code, and Fay can be applied to running
                 user-mode or kernel-mode software without compromising system
                 stability. At the highest level, Fay provides a unified, declarative
                 means of specifying what events to trace, as well as the aggregation,
                 processing, and analysis of those events. \par We have implemented the
                 Fay tracing platform for Windows and integrated it with two powerful,
                 expressive systems for distributed programming. Our implementation is
                 easy to use, can be applied to unmodified production systems, and
                 provides primitives that allow the overhead of tracing to be greatly
                 reduced, compared to previous dynamic tracing platforms. To show the
                 generality of Fay tracing, we reimplement, in experiments, a range of
                 tracing strategies and several custom mechanisms from existing tracing
                 frameworks. \par Fay shows that modern techniques for high-level
                  querying and data-parallel processing of disaggregated data streams are
                 well suited to comprehensive monitoring of software execution in
                 distributed systems. Revisiting a lesson from the late 1960s [15], Fay
                 also demonstrates the efficiency and extensibility benefits of using
                 safe, statically-verified machine code as the basis for low-level
                 execution tracing. Finally, Fay establishes that, by automatically
                 deriving optimized query plans and code for safe extensions, the
                 expressiveness and performance of high-level tracing queries can equal
                 or even surpass that of specialized monitoring tools.},
   url =	{http://budiu.info/work/fay-sosp11.pdf},
   confweb =	{http://sosp2011.gsd.inesc-id.pt}
}

@InProceedings{gonina-mapreduce11,
   author =	{Ekaterina Gonina and Anitha Kannan and John Shafer and Mihai Budiu},
   title =	{Parallelizing large-scale data processing applications with data skew: a case study in product-offer matching},
   booktitle =	{International Workshop on MapReduce and its Applications (MAPREDUCE)},
   address =	{San Jose, CA},
   month =	{June 8},
   year =	{2011},
   abstract =	{The last decade has seen a surge of interest in large-scale
                 data-parallel processing engines. While these engines share many
                  features in common with parallel databases, they make a set of different
                  trade-offs. In consequence, many of the lessons learned for programming
                 parallel databases have to be re-learned in the new environment. In
                 this paper we show a case study of parallelizing an example large-scale
                 application (offer matching, a core part of online shopping) on an
                 example MapReduce-based distributed computation engine (DryadLINQ). We
                 focus on the challenges raised by the nature of large data sets and
                  data skew and show how they can be addressed effectively within this
                 computation framework by optimizing the computation to adapt to the
                 nature of the data. In particular we describe three different
                 strategies for performing distributed joins and show how the platform
                 language allows us to implement optimization strategies at the
                 application level, without system support. We show that this
                  extensibility in the programming model allows for a highly
                 effective system, providing a measured speedup of more than 100 on 64
                 machines (256 cores), and an estimated speedup of 200 on 1280 machines
                 (5120 cores) when matching 4 million offers.},
   url =	{http://budiu.info/work/gonina-mr11.pdf},
   confweb =	{http://graal.ens-lyon.fr/mapreduce}
}

@InProceedings{jagannath-hips11,
   author =	{Vilas Jagannath and Zuoning Yin and Mihai Budiu},
   title =	{Monitoring and Debugging DryadLINQ Applications with Daphne},
   booktitle =	{International Workshop on High-Level Parallel Programming Models and
                 Supportive Environments (HIPS)},
   address =	{Anchorage, AK},
   month =	{May 20},
   year =	{2011},
   abstract =	{Debugging and optimizing large-scale applications is still more art
                 than engineering discipline. This document describes our experience in
                 building a set of tools to help DryadLINQ application developers
                 understand and debug their programs. \par The core infrastructure for
                 our tools is a portable library which provides a DryadLINQ job object
                 model (i.e., a local representation of the distributed state of an
                 executed application). Layered on the job object model we have built a
                 variety of interactive and batch tools for: performance data collection
                 and analysis, distributed state visualization, failure diagnostics,
                 debugging, and profiling.},
   url =	{http://budiu.info/work/jagannath-hips11.pdf},
   confweb =	{http://www.unixer.de/hips2011}
}

@InBook{mcsherry-chapter11,
   author =	{Frank McSherry and Yuan Yu and Mihai Budiu and Michael Isard and Dennis Fetterly},
   editor =	{Ron Bekkerman and Misha Bilenko and John Langford},
   title =	{Scaling Up Machine Learning},
   chapter =	{Large-Scale Machine Learning using {DryadLINQ}},
   publisher =	{Cambridge University Press},
   year =	{2011},
   abstract =	{This chapter describes DryadLINQ, a general-purpose system for large
                 scale data-parallel computing, and illustrates its use on a number of
                 machine learning problems.},
   url =	{http://budiu.info/work/dryad-ml-book-chapter.pdf},
   confweb =	{http://www.cambridge.org/us/knowledge/isbn/item6542017/?site_locale=en_US}
}

@InProceedings{budiu-ipdps11,
   author =	{Mihai Budiu and Daniel Delling and Renato Werneck},
   title =	{{DryadOpt}: Branch-and-Bound on Distributed Data-Parallel Execution Engines},
   booktitle =	{IEEE International Parallel and Distributed Processing Symposium
                 (IPDPS)},
   address =	{Anchorage, AK},
   month =	{May 16-20},
   year =	{2011},
   abstract =	{We introduce DryadOpt, a library that enables massively parallel and
                 distributed execution of optimization algorithms for solving hard
                 problems. DryadOpt performs an exhaustive search of the solution space
                 using branch-and-bound, by recursively splitting the original problem
                 into many simpler subproblems. DryadOpt uses both parallelism (at the
                 core level) and distributed execution (at the machine level). \par
                 DryadOpt provides to the users a simple yet powerful interface: the
                 user of the library only needs to implement sequential code to process
                 individual subproblems (either by solving them in full or generating
                 new subproblems). The parallelism and distribution are handled
                 automatically by the DryadOpt library, and are invisible to the user.
                  Our system is implemented on top of DryadLINQ --- a distributed
                 data-parallel execution engine similar to Hadoop and Map-Reduce.
                 Despite the fact that these engines offer a constrained application
                 model, with restricted communication patterns, our experiments show
                 that careful design choices allow DryadOpt to scale linearly with the
                 number of machines, with very little overhead.},
   url =	{http://budiu.info/work/ipdps11.pdf},
   confweb =	{http://www.ipdps.org/ipdps2011/2011_cfp.html}
}

@TechReport{budiu-tr10,
   author =	{Mihai Budiu},
   title =	{User interfaces for exploring multi-dimensional data sets},
   institution =	{Microsoft Research},
   number =	{MSR-TR-2010-67},
   month =	{June},
   year =	{2010},
   abstract =	{Visualization is a crucial part of data manipulation. However,
                  interactively exploring complex visualizations of large amounts of
                  information is not easy. To navigate through high-dimensional data
                  the user needs easy-to-use tools to discover patterns and correlations.
                  We propose several simple and intuitive user-interface elements
                 based on drag-and-drop to help the user discover correlations in
                 multi-dimensional data sets. The UI allows the user to (a) assign
                 colors to entities according to values of their attributes, (b)
                 drag-and-drop colors between different views displaying entity
                 attributes and (c) drag-and-drop data selections between views. We
                 exploit functional relationships in the underlying data model to
                 transport colors between views in a sound way.},
   url =	{http://research.microsoft.com/apps/pubs/?id=132078}
}

@Article{abadi-tissec09,
   author =	{Mart{\'\i}n Abadi and Mihai Budiu and {\'U}lfar Erlingsson and Jay Ligatti},
   title =	{Control-Flow Integrity principles, implementations and applications},
   journal =	{ACM Transactions on Information and System Security (TISSEC)},
   volume =	{13},
   number =	{1},
   pages =	{1--40},
   year =	{2009},
   abstract =	{Current software attacks often build on exploits that subvert
                 machine-code execution. The enforcement of a basic safety property,
                 control-flow integrity (CFI), can prevent such attacks from arbitrarily
                 controlling program behavior. CFI enforcement is simple and its
                 guarantees can be established formally, even with respect to powerful
                 adversaries. Moreover, CFI enforcement is practical: It is compatible
                 with existing software and can be done efficiently using software
                 rewriting in commodity systems. Finally, CFI provides a useful
                 foundation for enforcing further security policies, as we demonstrate
                 with efficient software implementations of a protected shadow call
                 stack and of access control for memory regions.},
   url =	{http://portal.acm.org/citation.cfm?id=1609960&jmp=abstract&coll=portal&dl=ACM&CFID=1071288&CFTOKEN=87398879#},
   doi =	{http://doi.acm.org/10.1145/1609956.1609960},
   confweb =	{http://www.acm.org/tissec/}
}

@InProceedings{goldszmidt-ladis09,
   author =	{Moises Goldszmidt and Mihai Budiu and Yue Zhang and Michael Pechuk},
   title =	{Towards Automatic Policy Refinement in Repair Services for Large Distributed Systems},
   booktitle =	{Large Scale Distributed Systems and Middleware (LADIS)},
   pages =	{5},
   address =	{Big Sky Resort, Big Sky, Montana},
   month =	{October 10-11},
   year =	{2009},
   note =	{Also published in ACM SIGOPS Operating Systems Review 44(2), 2010,
                  pp. 47--51.},
   url =	{http://budiu.info/work/goldszmidt-ladis09.pdf},
   confweb =	{http://www.cs.cornell.edu/projects/ladis2009/program.htm}
}

@InProceedings{kannan-socc09,
   author =	{Hari Kannan and Mihai Budiu and John D. Davis and Girish Venkataramani},
   title =	{Tuning {SoCs} using the Dynamic Critical Path},
   booktitle =	{IEEE International SOC Conference},
   address =	{Belfast, Northern Ireland},
   month =	{September 9-11},
   year =	{2009},
   abstract =	{We propose using a profiling-based technique (Dynamic Critical Path)
                 to guide SoC optimization. Optimizing SoCs composed of many modules
                 involves exploring a large space of possible configurations
                 (exponential in the number of component modules). We present this
                 optimization technique applied to a Globally Asynchronous Locally
                 Synchronous (GALS) RTL design. Furthermore, we investigate the loss of
                 precision when abstract versions of hardware modules are used for the
                 critical path computation. Using the critical path provides very fast
                 convergence towards optimal or near-optimal solutions when analyzing
                 large configuration spaces by optimizing the design for composite
                 optimization metrics, such as energy-delay.},
   note =	{Also as Microsoft Research Technical Report MSR-TR-2009-44},
   url =	{http://budiu.info/work/socc09.pdf},
   confweb =	{http://www.ieee-socc.org/SOCC2009/Authors__Page/SoCC2009-CFP.pdf}
}

@InProceedings{popa-hotcloud09,
   author =	{Lucian Popa and Mihai Budiu and Yuan Yu and Michael Isard},
   title =	{{DryadInc}: Reusing work in large-scale computations},
   booktitle =	{Workshop on Hot Topics in Cloud Computing (HotCloud)},
   address =	{San Diego, CA},
   month =	{June 15},
   year =	{2009},
   abstract =	{Many large-scale (cloud) computations operate on append-only,
                 partitioned datasets. We present two incremental computation frameworks
                 to reuse prior work in these circumstances: (1) reusing identical
                 computations already performed on data partitions, and (2) computing
                 just on the newly appended data and merging the new and previous
                 results.},
   url =	{http://budiu.info/work/hotcloud09.pdf},
   confweb =	{http://www.usenix.org/events/hotcloud09/index.html}
}

@TechReport{kannan-tr09,
   author =	{Hari Kannan and Mihai Budiu and John D. Davis and Girish Venkataramani},
   title =	{Tuning {SoCs} using the Dynamic Critical Path},
   institution =	{Microsoft Research},
   number =	{MSR-TR-2009-44},
   month =	{April},
   year =	{2009},
   abstract =	{We propose using the Global Dynamic Critical Path to diagnose
                 system-wide bottlenecks using representative benchmarks to direct
                 embedded SoC optimizations and provide real-world experience of
                 implementing the global critical path (GCP) analysis framework on a
                 Globally-Asynchronous Locally-Synchronous (GALS) SoC built around the
                 LEON3 CPU. We perform our analysis at the RTL level and extend our
                 evaluation to abstract RTL models. We use the power-delay product as
                 the example cost function for optimization; we can adjust the
                 power-delay by tuning the frequency of the clock domains of each SoC IP
                 block. We show that the GCP optimization framework can accommodate
                 other cost functions as well, while effectively directing SoC
                 optimization efforts. Our case studies demonstrate that the GCP
                 algorithm can converge quickly to solutions even in the very large
                 (exponential) search spaces describing permissible SoC configurations,
                 with no designer intervention (for instance, we find the solution of a
                 6-dimensional space with 19000 configurations in 11 steps). Even though
                 our initial implementation relies on manual source code
                 instrumentation, we only add 1\% extra lines of code to the design.
                 This represents annotating less than 0.2\% of the ports of the overall
                 Multi-processor SoC design.},
   url =	{http://research.microsoft.com/apps/pubs/?id=80367}
}

@InProceedings{cretu-wasl08,
   author =	{Gabriela F. Cre{\c t}u-Cioc{\^ a}rlie and Mihai Budiu and Moises Goldszmidt},
   title =	{Hunting for problems with {Artemis}},
   booktitle =	{USENIX Workshop on the Analysis of System Logs (WASL)},
   address =	{San Diego, CA},
   month =	{December 7},
   year =	{2008},
   abstract =	{Artemis is a modular application designed for analyzing and
                 troubleshooting the performance of large clusters running datacenter
                 services. Artemis is composed of four modules: (1) distributed log
                 collection and data extraction, (2) a database storing the extracted
                 data, (3) an interactive visualization tool for exploring the data, and
                 (4) a plug-in interface (and a set of sample plug-ins) allowing users
                 to implement data analysis tools including (a) the extraction and
                 construction of new features from the basic measurements collected, and
                 (b) the implementation and invocation of statistical and machine
                 learning algorithms and tools. In this paper we describe each of these
                 components and then we illustrate the power of the plug-in architecture
                 by presenting a case-study using Artemis to analyze a Dryad application
                 running on a 240-machine cluster.},
   url =	{http://budiu.info/work/wasl08.pdf},
   confweb =	{http://www.usenix.org/events/wasl08/}
}

@InProceedings{yu-osdi08,
   author =	{Yuan Yu and Michael Isard and Dennis Fetterly and Mihai Budiu and {\'U}lfar Erlingsson and Pradeep Kumar Gunda and Jon Currey},
   title =	{{DryadLINQ}: A System for General-Purpose Distributed Data-Parallel Computing Using a High-Level Language},
   booktitle =	{Symposium on Operating System Design and Implementation (OSDI)},
   pages =	{14},
   address =	{San Diego, CA},
   month =	{December 8-10},
   year =	{2008},
   abstract =	{DryadLINQ is a system and a set of language extensions that enable a
                 new programming model for large scale distributed computing. It
                 generalizes previous execution environments such as SQL, MapReduce, and
                 Dryad in two ways: by adopting an expressive data model of strongly
                 typed .NET objects; and by supporting general-purpose imperative and
                 declarative operations on datasets within a traditional high-level
                 programming language. A DryadLINQ program is a sequential program
                  composed of LINQ expressions performing arbitrary side-effect-free
                 transformations on datasets, and can be written and debugged using
                 standard .NET development tools. The DryadLINQ system automatically and
                 transparently translates the data-parallel portions of the program into
                 a distributed execution plan which is passed to the Dryad execution
                 platform. Dryad, which has been in continuous operation for several
                 years on production clusters made up of thousands of computers, ensures
                 efficient, reliable execution of this plan. We describe the
                 implementation of the DryadLINQ compiler and runtime. We evaluate
                 DryadLINQ on a varied set of programs drawn from domains such as
                 web-graph analysis, large-scale log mining, and machine learning. We
                 show that excellent absolute performance can be attained --- a
                  general-purpose sort of $10^{12}$ bytes of data executes in 319 seconds
                  on a 240-computer, 960-disk cluster --- as well as demonstrating
                 near-linear scaling of execution time on representative applications as
                 we vary the number of computers used for a job.},
   url =	{http://budiu.info/work/DryadLINQ.pdf},
   confweb =	{http://www.usenix.org/event/osdi08/},
   acceptancerate =	{26/193=13\%}
}

@TechReport{yu-tr08,
   author =	{Yuan Yu and Michael Isard and Dennis Fetterly and Mihai Budiu and {\'U}lfar Erlingsson and Pradeep Kumar Gunda and Jon Currey and Frank McSherry and Kannan Achan},
   title =	{Some sample programs written in {DryadLINQ}},
   institution =	{Microsoft Research},
   number =	{MSR-TR-2008-74},
   pages =	{37},
   month =	{May},
   year =	{2008},
   abstract =	{DryadLINQ is a system and a set of language extensions that enable a
                 new programming model for large scale distributed computing. This
                 technical report contains annotated listings of several example
                 programs written using DryadLINQ, illustrating typical usage.},
   url =	{http://research.microsoft.com/research/pubs/view.aspx?msr_tr_id=MSR-TR-2008-74}
}

@inproceedings{venkataramani-dac07,
   author =	{Girish Venkataramani and Tiberiu Chelcea and Mihai Budiu and Seth C. Goldstein},
   title =	{Critical Path: A Tool for System-Level Timing Analysis},
   booktitle =	{Design Automation Conference (DAC)},
   address =	{San Diego, CA},
   month =	{June 4--8},
   year =	{2007},
   abstract =	{An effective method for focusing optimization effort on the most
                 important parts of a design is to examine those elements on the
                 critical path. Traditionally, the critical path is defined at the RTL
                 level, as the longest path in the combinational logic between clocked
                 registers. In this paper, we present a system-level timing analysis
                 technique to define the concept of a Global Critical Path (GCP), for
                 predicting system-level performance. We show how the GCP can be used as
                 a theoretical and practical tool for understanding, summarizing and
                 optimizing the behavior of highly concurrent self-timed circuits. We
                 formally define the GCP and show how it can be constructed using a
                 discrete event model and hardware profiling techniques. The GCP
                 provides valuable insight into the control-path behavior of circuits
                 and in finding system-level bottlenecks. We have incorporated the GCP
                 construction and analysis framework into a high-level synthesis and
                 simulation toolchain, thus enabling complete automation in modeling,
                 analysis and optimization.},
   note =	{An expanded version is in CMU-CS-06-144},
   url =	{http://budiu.info/work/dac07.pdf},
   acceptancerate =	{161/713 = 22\%},
   confweb =	{http://www.dac.com/44th/index.html}
}

@inproceedings{isard-eurosys07,
   author =	{Michael Isard and Mihai Budiu and Yuan Yu and Andrew Birrell and Dennis Fetterly},
   title =	{{Dryad}: Distributed Data-Parallel Programs from Sequential Building Blocks},
   booktitle =	{European Conference on Computer Systems (EuroSys)},
   pages =	{59--72},
   address =	{Lisbon, Portugal},
   month =	{March 21-23},
   year =	{2007},
   abstract =	{Dryad is a general-purpose distributed execution engine for
                 coarse-grain data-parallel applications. A Dryad application combines
                 computational ``vertices'' with communication ``channels'' to form a
                 dataflow graph. Dryad runs the application by executing the vertices of
                 this graph on a set of available computers, communicating as
                 appropriate through files, TCP pipes, and shared-memory FIFOs. The
                 vertices provided by the application developer are quite simple and are
                 usually written as sequential programs with no thread creation or
                 locking. Concurrency arises from Dryad scheduling vertices to run
                 simultaneously on multiple computers, or on multiple CPU cores within a
                 computer. The application can discover the size and placement of data
                 at run time, and modify the graph as the computation progresses to make
                 efficient use of the available resources. Dryad is designed to scale
                 from powerful multi-core single computers, through small clusters of
                 computers, to data centers with thousands of computers. The Dryad
                 execution engine handles all the difficult problems of creating a large
                 distributed, concurrent application: scheduling the use of computers
                 and their CPUs, recovering from communication or computer failures, and
                 transporting data between vertices.},
   note =	{Also as technical report MSR-TR-2006-140},
   url =	{http://budiu.info/work/eurosys07.pdf},
   confweb =	{http://www.eurosys.org/2007}
}

@InProceedings{budiu-asid06,
   author =	{Mihai Budiu and {\'U}lfar Erlingsson and Mart{\'\i}n Abadi},
   title =	{Architectural Support for Software-Based Protection},
   booktitle =	{Workshop on Architectural and System Support for Improving Software
                 Dependability (ASID)},
   pages =	{42--51},
   address =	{San Jose, CA},
   month =	{October 21},
   year =	{2006},
   abstract =	{Control-Flow Integrity (CFI) is a property that guarantees program
                 control flow cannot be subverted by a malicious adversary, even if the
                 adversary has complete control of data memory. We have shown in prior
                 work how CFI can be enforced by using inlined software guards that
                 perform safety checks. The first part of this paper shows how modest
                 Instruction Set Architecture (ISA) support can replace such guard code
                 with single instructions. \par On the foundation of CFI we have
                 implemented XFI: a protection system that offers fine-grained memory
                 access control and fundamental integrity guarantees for critical system
                 state. XFI can be seen as a flexible, generalized form of
                 software-based fault isolation (SFI). In the second part of this paper
                 we present ISA support for XFI, in the form of simple bounds-check
                 instructions. \par CFI and XFI can significantly increase the security
                 and integrity of software execution. Our results indicate that support
                 for CFI and XFI is a straightforward, simple addition to hardware
                 architectures. Compared to software guards, such hardware support
                 increases the efficiency and simplicity of enforcement.},
   note =	{Also as technical report MSR-TR-2006-115},
   url =	{http://budiu.info/work/asid06.pdf},
   confweb =	{http://opera.cs.uiuc.edu/ASID}
}

@TechReport{venkataramani-tr06,
   author =	{Girish Venkataramani and Tiberiu Chelcea and Mihai Budiu and Seth C. Goldstein},
   title =	{Modeling the Global Critical Path in Concurrent Systems},
   institution =	{Carnegie Mellon University, Computer Science Department},
   number =	{CMU-CS-06-144},
   pages =	{22},
   month =	{August},
   year =	{2006},
   abstract =	{We show how the global critical path can be used as a practical tool
                 for understanding, optimizing and summarizing the behavior of highly
                 concurrent self-timed circuits. Traditionally, critical path analysis
                 has been applied to DAGs, and thus is constrained to combinatorial
                 sub-circuits. We formally define the global critical path (GCP) and
                 show how it can be constructed using only local information that is
                 automatically derived directly from the circuit. We introduce a form of
                 Production Rules, which can accurately determine the GCP for a given
                 input vector even for modules which exhibit choice. \par The GCP
                 provides valuable insight into the behavior of circuits and, more
                 generally, into the control behavior of the application class. These
                 insights help in formulating new optimizations and re-formulating
                 existing ones to use the GCP knowledge. We have incorporated our
                 framework into a high-level synthesis tool-chain, which automatically
                 computes GCP, and utilizes the GCP for large scale circuits. We
                 demonstrate the effectiveness of the GCP framework by re-formulating
                  two traditional CAD optimizations to use the GCP, yielding efficient
                 algorithms which improve circuit power (by up to 9\%) and performance
                 (by up to 60\%) in our experiments.},
   url =	{http://reports-archive.adm.cs.cmu.edu/anon/2006/abstracts/06-144.html}
}

@InProceedings{erlingsson-osdi06,
   author =	{{\'{U}}lfar Erlingsson and Mart{\'\i}n Abadi and Michael Vrable and Mihai Budiu and George C. Necula},
   title =	{{XFI}: Software Guards for System Address Spaces},
   booktitle =	{Symposium on Operating System Design and Implementation (OSDI)},
   pages =	{75--88},
   address =	{Seattle, WA},
   month =	{November 6-8},
   year =	{2006},
   abstract =	{XFI is a comprehensive protection system that offers both flexible
                 access control and fundamental integrity guarantees, at any privilege
                 level and even for legacy code in commodity systems. For this purpose,
                 XFI combines static analysis with inline software guards and a
                  two-stack execution model. We have implemented XFI for Windows on the
                 x86 architecture using binary rewriting and a simple, stand-alone
                  verifier; the implementation's correctness depends on the verifier, but
                 not on the rewriter. We have applied XFI to software such as device
                 drivers and multimedia codecs. The resulting modules function safely
                 within both kernel and user-mode address spaces, with only modest
                  enforcement overheads.},
   url =	{http://budiu.info/work/osdi06.pdf},
   confweb =	{http://www.usenix.org/events/osdi06},
   acceptancerate =	{27/150=18\%}
}

@InProceedings{mishra-asplos06,
   author =	{Mahim Mishra and Timothy J. Callahan and Tiberiu Chelcea and Girish Venkataramani and Mihai Budiu and Seth C. Goldstein},
   title =	{{Tartan}: Evaluating Spatial Computation For Whole Program Execution},
   booktitle =	{International Conference on Architectural Support for Programming
                 Languages and Operating Systems (ASPLOS)},
   pages =	{163--174},
   address =	{San Jose, CA},
   month =	{October 21-25},
   year =	{2006},
   abstract =	{Spatial Computing (SC) has been shown to be an energy-efficient model
                 for implementing program kernels. In this paper we explore the
                 feasibility of using SC for more than small kernels. To this end, we
                 evaluate the performance and energy efficiency of entire applications
                 on Tartan, a general-purpose architecture which integrates a
                 reconfigurable fabric (RF) with a superscalar core. Our compiler
                 automatically partitions and compiles an application into an
                 instruction stream for the core and a configuration for the RF. We use
                 a detailed simulator to capture both timing and energy numbers for all
                 parts of the system. Our results indicate that a hierarchical RF
                 architecture, designed around a scalable interconnect, is instrumental
                 in harnessing the benefits of spatial computation. The interconnect
                 uses static configuration and routing at the lower levels and a
                 packet-switched, dynamically-routed network at the top level. Tartan is
                  most energy-efficient when almost all of the application is mapped to
                 the RF, indicating the need for the RF to support most general-purpose
                 programming constructs. Our initial investigation reveals that such a
                 system can provide, on average, an order of magnitude improvement in
                 energy-delay compared to an aggressive superscalar core on
                 single-threaded workloads.},
   url =	{http://budiu.info/work/asplos06.pdf},
   confweb =	{http://www.princeton.edu/~asplos06/},
   acceptancerate =	{38/158=24\%}
}

@InProceedings{budiu-ispass05,
   author =	{Mihai Budiu and Pedro V. Artigas and Seth Copen Goldstein},
   title =	{Dataflow: A Complement to Superscalar},
   booktitle =	{IEEE International Symposium on Performance Analysis of Systems and
                 Software (ISPASS)},
   pages =	{177--186},
   address =	{Austin, TX},
   month =	{March 20-22},
   year =	{2005},
   abstract =	{There has been a resurgence of interest in dataflow architectures,
                 because of their potential for exploiting parallelism with low
                 overhead. In this paper we analyze the performance of a class of static
                 dataflow machines on integer media and control-intensive programs and
                 we explain why a dataflow machine, even with unlimited resources, does
                 not always outperform a superscalar processor on general-purpose codes,
                 under the assumption that both machines take the same time to execute
                 basic operations. We compare a program-specific dataflow machine with
                 unlimited parallelism to a superscalar processor running the same
                 program. While the dataflow machines provide very good performance on
                 most data-parallel programs, we show that the dataflow machine cannot
                 always take advantage of the available parallelism. Using the dynamic
                 critical path we investigate the mechanisms used by superscalar
                 processors to provide a performance advantage and their impact on a
                 dataflow model.},
   url =	{http://www.cs.cmu.edu/~mihaib/research/ispass05.pdf},
   confweb =	{http://www.ispass.org/ispass2005},
   acceptancerate =	{27/92 = 29\%}
}

@InProceedings{budiu-odes05,
   author =	{Mihai Budiu and Seth Copen Goldstein},
   title =	{Inter-Iteration Scalar Replacement in the Presence of Conditional Control-Flow},
   booktitle =	{Workshop on Optimizations for {DSP} and Embedded Systems (ODES)},
   pages =	{20--29},
   address =	{San Jose, CA},
   month =	{March 20},
   year =	{2005},
   abstract =	{A large class of multimedia programs for embedded systems manipulate
                 data represented as dense matrices. In this paper we revisit the
                 classical optimization of scalar replacement of array elements and
                 pointer accesses; this optimization allocates array elements to
                 registers, reducing memory traffic. We generalize the state-of-the-art
                 algorithm, by Carr and Kennedy~\cite{carr-spe94}, improving it to
                 handle simultaneously both conditional control-flow and inter-iteration
                 data reuse. Our algorithm operates within the same assumptions of the
                 classical one (perfect dependence information), and has the same
                 limitations (increased register pressure). It is, however, optimal in
                 the sense that within each code region where scalar promotion is
                 applied, given sufficient registers, each memory location is
                 read/written at most once.},
   url =	{http://www.cs.cmu.edu/~mihaib/research/odes05.pdf},
   confweb =	{http://www.ece.vill.edu/~deepu/odes/odes-3_program.html}
}

@inproceedings{abadi-icfem05,
   author =	{Mart{\'\i}n Abadi and Mihai Budiu and {\'U}lfar Erlingsson and Jay Ligatti},
   title =	{A Theory of Secure Control-Flow},
   booktitle =	{International Conference on Formal Engineering Methods (ICFEM)},
   pages =	{111-124},
   address =	{Manchester, UK},
   month =	{November 1-4},
   year =	{2005},
   abstract =	{Control-Flow Integrity (CFI) means that the execution of a program
                 dynamically follows only certain paths, in accordance with a static
                 policy. CFI can prevent attacks that, by exploiting buffer overflows
                 and other vulnerabilities, attempt to control program behavior. This
                 paper develops the basic theory that underlies two practical techniques
                 for CFI enforcement, with precise formulations of hypotheses and
                 guarantees.},
   url =	{http://budiu.info/work/icfem05.pdf},
   confweb =	{http://www.cs.man.ac.uk/icfem05}
}

@inproceedings{abadi-ccs05,
   author =	{Mart{\'\i}n Abadi and Mihai Budiu and {\'U}lfar Erlingsson and Jay Ligatti},
   title =	{Control-Flow Integrity},
   booktitle =	{ACM Conference on Computer and Communication Security (CCS)},
   pages =	{340-353},
   address =	{Alexandria, VA},
   month =	{November 7-11},
   year =	{2005},
   abstract =	{Current software attacks often build on exploits that subvert
                 machine-code execution. The enforcement of a basic safety property,
                 Control-Flow Integrity (CFI), can prevent such attacks from arbitrarily
                 controlling program behavior. CFI enforcement is simple, and its
                 guarantees can be established formally, even with respect to powerful
                 adversaries. Moreover, CFI enforcement is practical: it is compatible
                 with existing software and can be efficiently implemented using
                 software rewriting in commodity systems. Finally, CFI provides a useful
                 foundation for enforcing further security policies, such as policies
                 that constrain the use of data memory.},
   url =	{http://budiu.info/work/ccs05.pdf},
   confweb =	{http://www.acm.org/sigs/sigsac/ccs/CCS2005},
   acceptancerate =	{38/250=15\%}
}

@InProceedings{budiu-asplos04,
   author =	{Mihai Budiu and Girish Venkataramani and Tiberiu Chelcea and Seth Copen Goldstein},
   title =	{Spatial Computation},
   booktitle =	{International Conference on Architectural Support for Programming
                 Languages and Operating Systems (ASPLOS)},
   pages =	{14--26},
   address =	{Boston, MA},
   month =	{October 9-13},
   year =	{2004},
   abstract =	{This paper describes a computer architecture that relies on the direct
                 translation of high-level language programs into {\em Spatial
                 Computation} (SC) hardware structures. SC program implementations are
                 completely distributed, without any centralized control. SC circuits
                 are optimized for {\em wires} at the expense of computation units. \par
                  In this paper we investigate a particular implementation of SC structures
                 called ASH (Application-Specific Hardware). Under the assumption that
                 computation is cheaper than communication, ASH replicates computation
                 units to simplify interconnect, building a system which uses very
                 simple, completely dedicated communication channels. As a consequence,
                 communication on the datapath never requires arbitration; the only
                 arbitration required is for accessing memory. ASH relies on very simple
                 hardware primitives, using no associative structures, no multiported
                 register files, no scheduling logic, no broadcast, and no clocks. As a
                 consequence, ASH hardware is fast and extremely power efficient. \par
                 In this work we demonstrate three features of ASH: (1) that such
                 architectures can be built by automatic compilation of C programs, (2)
                 that distributed computation is in some respects fundamentally
                 different from monolithic superscalar processors and (3) that ASIC
                 implementations of ASH use 3 orders of magnitude less energy compared
                 to high-end superscalar processors, while being within a factor of two
                 in performance.},
   url =	{http://www.cs.cmu.edu/~mihaib/research/asplos04.pdf},
   doi =	{http://doi.acm.org/10.1145/1024393.1024396},
   confweb =	{http://www.eecg.toronto.edu/asplos2004},
   acceptancerate =	{24/169 = 14\%}
}

@InProceedings{koes-msp04,
   author =	{David Koes and Mihai Budiu and Girish Venkataramani and Seth Copen Goldstein},
   title =	{Programmer Specified Pointer Independence},
   booktitle =	{Workshop on Memory System Performance (MSP)},
   month =	{June},
   year =	{2004},
   abstract =	{Good alias analysis is essential in order to achieve high performance
                 on modern processors, yet interprocedural analysis does not scale well.
                 We present a source code annotation, {\tt \#pragma independent}, which
                 is a more flexible, intuitive and useful way for the programmer to
                 provide pointer aliasing information than the current C99 {\tt
                 restrict} keyword. We describe a tool which highlights the most
                 important and most likely correct locations at which a programmer can
                 insert the pragmas. We show that such annotations can be used
                 effectively in compilers to achieve speedups of up to 1.2x.},
   note =	{Also as technical report CMU-CS-03-123},
   url =	{http://www.cs.cmu.edu/~mihaib/research/msp04.pdf},
   confweb =	{http://cs.anu.edu.au/~Steve.Blackburn/msp2004}
}

@InProceedings{venkataramani-iwls04,
   author =	{Girish Venkataramani and Mihai Budiu and Seth Copen Goldstein},
   title =	{{C} to Asynchronous Dataflow Circuits: An End-to-End Toolflow},
   booktitle =	{International Workshop on Logic Synthesis (IWLS)},
   pages =	{501--508},
   address =	{Temecula, CA},
   month =	{June},
   year =	{2004},
   abstract =	{We present a complete toolflow that translates ANSI-C programs into
                 asynchronous circuits. The toolflow is built around a compiler that
                 converts C into a functional dataflow intermediate representation,
                 exposing instruction-level, pipeline and memory parallelism. The
                 compiler performs optimizations and converts the intermediate
                 representation into pipelined asynchronous circuits, with no
                 centralized controllers. In the resulting circuits, control is
                 distributed, communication is achieved through local wires, and
                 arbitration for datapath resources is unnecessary. Circuits
                 automatically synthesized from Mediabench kernels exhibit substantially
                 better energy-delay than either single-issue processors or aggressive
                 superscalar cores.},
   note =	{(full paper)},
   url =	{http://www.cs.cmu.edu/~mihaib/research/iwls04.pdf},
   confweb =	{http://www.iwls.org/}
}

@InProceedings{budiu-cgo03,
   author =	{Mihai Budiu and Seth Copen Goldstein},
   title =	{Optimizing Memory Accesses For Spatial Computation},
   booktitle =	{International ACM/IEEE Symposium on Code Generation and Optimization
                 (CGO)},
   pages =	{216-227},
   address =	{San Francisco, CA},
   month =	{March 23--26},
   year =	{2003},
   abstract =	{In this paper we present the internal representation and optimizations
                 used by the CASH compiler for improving the memory parallelism of
                 pointer-based programs. CASH uses an SSA-based representation for
                 memory, which compactly summarizes both control-flow and dependence
                 information. In CASH, memory optimization is a four-step process: (1)
                 first an initial, relatively coarse, representation of memory
                 dependences is built; (2) next, unnecessary memory dependences are
                 removed using dependence tests; (3) third, redundant memory operations
                  are removed; (4) finally, parallelism is increased by pipelining memory
                 accesses in loops. While the first three steps above are very general,
                 the loop pipelining transformations are particularly applicable for
                 spatial computation, which is the primary target of CASH. The redundant
                 memory removal optimizations presented are: load/store hoisting
                 (subsuming partial redundancy elimination and common-subexpression
                 elimination), load-after-store removal, store-before-store removal
                 (dead store removal) and loop-invariant load motion. One of our loop
                 pipelining transformations is a new form of loop parallelization,
                 called loop decoupling. This transformation separates independent
                 memory accesses within a loop body into several independent loops,
                 which are allowed dynamically to slip with respect to each other. A new
                  computational primitive, a token generator, is used to dynamically
                 control the amount of slip, allowing maximum freedom, while
                 guaranteeing that no memory dependences are violated.},
   url =	{http://www-2.cs.cmu.edu/~mihaib/research/cgo03.pdf},
   url2 =	{http://csdl.computer.org/comp/proceedings/cgo/2003/1913/00/19130216abs.htm},
   confweb =	{http://www.cgo.org/cgo2003}
}

@InProceedings{goldstein-asap03,
   author =	{Seth Goldstein and Mihai Budiu and Mahim Mishra and Girish Venkataramani},
   title =	{Reconfigurable Computing and Electronic Nanotechnology},
   booktitle =	{IEEE International Conference on Application-specific Systems,
                 Architectures and Processors},
   pages =	{132--143},
   address =	{The Hague, The Netherlands},
   month =	{June 24-26},
   year =	{2003},
   abstract =	{In this paper we examine the opportunities brought about by recent
                 progress in electronic nanotechnology and describe the methods needed
                 to harness them for building a new computer architecture. In this
                 process we decompose some traditional abstractions, such as the
                 transistor, into fine-grain pieces, such as signal restoration and
                 input-output isolation. We also show how we can forgo the extreme
                 reliability of CMOS circuits for low-cost chemical self-assembly at the
                 expense of large manufacturing defect densities. We discuss advanced
                 testing methods which can be used to recover perfect functionality from
                 unreliable parts. We proceed to show how the molecular switch, the
                 regularity of the circuits created by self-assembly and the high defect
                 densities logically require the use of reconfigurable hardware as a
                 basic building block for hardware design. We then capitalize on the
                 convergence of compilation and hardware synthesis (which takes place
                 when programming reconfigurable hardware) to propose the complete
                 elimination of the instruction-set architecture from the system
                 architecture, and the synthesis of asynchronous dataflow machines
                 directly from high-level programming languages, such as C. We discuss
                  in some detail a scalable compilation system that performs this task.},
   note =	{Invited paper},
   url =	{http://www.cs.cmu.edu/~mihaib/research/asap03.pdf},
   url2 =	{http://csdl.computer.org/comp/proceedings/asap/2003/1992/00/19920132abs.htm},
   confweb =	{http://www-ece.rice.edu/asap2003/program03.html}
}

@PhdThesis{budiu-phd03,
   key =	{PhD Thesis 03},
   author =	{Mihai Budiu},
   title =	{Spatial Computation},
   school =	{Carnegie Mellon University, Computer Science Department},
   number =	{CMU-CS-03-217},
   pages =	{225},
   month =	{December},
   year =	{2003},
   abstract =	{This thesis presents a compilation framework for translating ANSI C
                 programs into hardware dataflow machines. The framework is embodied in
                 the CASH compiler, a Compiler for Application-Specific Hardware. CASH
                 generates asynchronous hardware circuits that directly implement the
                 functionality of the source program, without using any interpretative
                 structures. This style of computation is dubbed ``Spatial
                 Computation''. CASH relies extensively on predication and speculation
                 for building efficient hardware circuits. \par The first part of this
                 document describes Pegasus, the internal representation of CASH, and a
                 series of novel program transformations performed by CASH, the most
                 notable of which are a new optimal register-promotion algorithm and
                 partial redundancy elimination for memory accesses based on predicate
                 manipulation. \par The second part of this document evaluates the
                 performance of the generated circuits using simulation. Using media
                 processing benchmarks, we show that for the domain of embedded
                 computation, the circuits generated by CASH can sustain high levels of
                 instruction level parallelism, due to the effective use of dataflow
                 software pipelining. A comparison of Spatial Computation and
                 superscalar processors highlights some of the weaknesses of our model
                 of computation, such as the lack of branch prediction and register
                 renaming. \par The results presented in this document can be applied in
                 several domains: (1) most of the compiler optimizations are applicable
                  to traditional compilers for high-level languages, (2) CASH itself can
                 be used as a hardware synthesis tool for very fast system-on-a-chip
                 prototyping directly from C sources, (3) the compilation framework we
                 describe can be applied to the translation of imperative languages to
                 dataflow machines, (4) we have extended the dataflow machine model to
                 encompass predication, data-speculation and control-speculation, and
                 (5) the tool-chain described and some specific optimizations, such as
                 lenient execution and pipeline balancing, can be used for synthesis
                 and optimization of asynchronous hardware.},
   note =	{Technical report CMU-CS-03-217},
   url =	{http://www.cs.cmu.edu/~mihaib/research/thesis.pdf},
   url2 =	{http://reports-archive.adm.cs.cmu.edu/anon/2003/abstracts/03-217.html}
}
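
The thesis abstract above centers on executing programs as dataflow graphs instead of on an interpretive processor. The toy Python sketch below (not CASH or its Pegasus representation; the node names and the little run() scheduler are invented for illustration) shows the basic idea: each operation fires when its input tokens arrive, so independent operations such as the add and subtract below can proceed in parallel.

# A toy, hypothetical illustration (not CASH/Pegasus) of executing a program
# as a dataflow graph rather than on an interpretive fetch/execute engine:
# each node fires when its input tokens arrive, so instruction-level
# parallelism falls out of the graph structure.  All names are made up.

from collections import deque

class Node:
    def __init__(self, name, op, arity):
        self.name, self.op, self.arity = name, op, arity
        self.inputs, self.succs = {}, []   # port -> value, list of (node, port)

def connect(src, dst, port):
    src.succs.append((dst, port))

def run(sources, values):
    """Push initial tokens into the source nodes, then fire ready nodes."""
    ready = deque()
    for node, val in zip(sources, values):
        node.inputs[0] = val
        ready.append(node)
    results = {}
    while ready:
        n = ready.popleft()
        out = n.op(*[n.inputs[p] for p in range(n.arity)])
        results[n.name] = out
        for succ, port in n.succs:
            succ.inputs[port] = out
            if len(succ.inputs) == succ.arity:
                ready.append(succ)
    return results

# y = (a + b) * (a - b): the add and subtract nodes are independent and could
# fire in the same "cycle"; the multiply fires once both of its tokens arrive.
a   = Node("a", lambda x: x, 1)
b   = Node("b", lambda x: x, 1)
add = Node("add", lambda x, y: x + y, 2)
sub = Node("sub", lambda x, y: x - y, 2)
mul = Node("mul", lambda x, y: x * y, 2)
connect(a, add, 0); connect(b, add, 1)
connect(a, sub, 0); connect(b, sub, 1)
connect(add, mul, 0); connect(sub, mul, 1)
print(run([a, b], [7, 3]))   # {'a': 7, 'b': 3, 'add': 10, 'sub': 4, 'mul': 40}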

@InBook{goldstein-chapter03,
   key =	{Chapter 03},
   author =	{Seth Copen Goldstein and Mihai Budiu},
   editor =	{Mark A. Reed and Takhee Lee},
   title =	{Molecules, Gates, Circuits, Computers},
   chapter =	{in Molecular Nanoelectronics},
   pages =	{327--388},
   publisher =	{American Scientific Publishers},
   month =	{January},
   year =	{2003},
   url =	{http://aspbs.com/molecularnano.html}
}

@InProceedings{budiu-fpl02,
   author =	{Mihai Budiu and Seth Copen Goldstein},
   title =	{Compiling Application-Specific Hardware},
   booktitle =	{International Conference on Field Programmable Logic and Applications
                 (FPL)},
   pages =	{853--863},
   address =	{Montpellier (La Grande-Motte), France},
   month =	{September 2--4},
   year =	{2002},
   abstract =	{In this paper we describe ASH, an architectural framework for
                 implementing Application-Specific Hardware. ASH is based on automatic
                 hardware synthesis from high-level languages. The generated circuits
                 use only localized computation structures; in consequence, we expect
                 these circuits to be fast, to use little power and to scale well with
                 program complexity. \par We present in detail CASH, a scalable compiler
                 framework for ASH, which generates hardware from programs written in C.
                 Our compiler exploits instruction level parallelism by using aggressive
                 speculation and dynamic scheduling. Based on this compilation scheme,
                 we evaluate the computational resources necessary for implementing
                 complex integer-based programs, and we suggest architectural features
                 that would support the ASH framework.},
   url =	{http://www.cs.cmu.edu/~mihaib/research/fpl02-budiu.pdf},
   url2 =	{http://springerlink.metapress.com/openurl.asp?genre=article&issn=0302-9743&volume=2438&spage=853},
   confweb =	{http://www.lirmm.fr/fpl2002/home.html}
}

@InProceedings{venkataramani-fpl02,
   key =	{FPL 02a},
   author =	{Girish Venkataramani and Suraj Sudhir and Mihai Budiu and Seth Copen Goldstein},
   title =	{Factors Influencing the Performance of a CPU-RFU Hybrid Architecture},
   booktitle =	{International Conference on Field Programmable Logic and Applications
                 (FPL)},
   pages =	{955--965},
   address =	{Montpellier (La Grande-Motte), France},
   month =	{September},
   year =	{2002},
   abstract =	{Closely coupling a reconfigurable fabric with a conventional processor
                 has been shown to successfully improve the system performance. However,
                 today's superscalar processors are both complex and adept at
                 extracting Instruction Level Parallelism (ILP), which introduces many
                 complex issues to the design of a hybrid CPU-RFU system. This paper
                 examines the design of a superscalar processor augmented with a
                 closely-coupled reconfigurable fabric. It identifies architectural and
                 compiler issues that affect the performance of the overall system.
                 Previous efforts at combining a processor core with a reconfigurable
                 fabric are examined in the light of these issues. We also present
                 simulation results that emphasize the impact of these factors.},
   url =	{http://www.cs.cmu.edu/~mihaib/research/fpl02-girish.pdf},
   url2 =	{http://springerlink.metapress.com/openurl.asp?genre=article&issn=0302-9743&volume=2438&spage=955},
   confweb =	{http://www.lirmm.fr/fpl2002/home.html}
}

@InProceedings{budiu-fccm02,
   author =	{Mihai Budiu and Mahim Mishra and Ashwin Bharambe and Seth Copen Goldstein},
   title =	{Peer-to-peer Hardware-Software Interfaces for Reconfigurable Fabrics},
   booktitle =	{IEEE Symposium on Field-Programmable Custom Computing Machines (FCCM)},
   pages =	{57--66},
   address =	{Napa Valley, CA},
   month =	{April},
   year =	{2002},
   abstract =	{In this paper we describe a peer-to-peer interface between processor
                 cores and reconfigurable fabrics. The main advantage of the
                 peer-to-peer model is that it greatly expands the scope of application
                 for reconfigurable computing and hence its potential benefits. The
                 primary extension in our model is that ``code'' on the reconfigurable
                 hardware unit is allowed to invoke routines both on the reconfigurable
                 unit itself and on the fixed logic processor. We describe the software
                 constructs and compilation mechanisms needed for such an architecture,
                 including a detailed description of the interface between the two parts
                 of the application.},
   url =	{http://www.cs.cmu.edu/~mihaib/research/fccm02-peer.pdf},
   url2 =	{http://csdl.computer.org/comp/proceedings/fccm/2002/1801/00/18010057abs.htm},
   confweb =	{http://www.fccm.org/preprog02.html}
}

@InProceedings{goldstein-isca01,
   author =	{Seth Copen Goldstein and Mihai Budiu},
   title =	{{NanoFabrics}: Spatial Computing Using Molecular Electronics},
   booktitle =	{International Symposium on Computer Architecture (ISCA)},
   pages =	{178--189},
   address =	{G\"{o}teborg, Sweden},
   year =	{2001},
   abstract =	{The continuation of the remarkable exponential increases in processing
                 power over the recent past faces imminent challenges due in part to the
                 physics of deep-submicron CMOS devices and the costs of both chip masks
                 and future fabrication plants. A promising solution to these problems
                 is offered by an alternative to CMOS-based computing, chemically
                 assembled electronic nanotechnology (CAEN). In this paper we outline
                 how CAEN-based computing can become a reality. We briefly describe
                 recent work in CAEN and how CAEN will affect computer architecture. We
                 show how the inherently reconfigurable nature of CAEN devices can be
                 exploited to provide high-density chips with defect tolerance, which
                 will significantly reduce the cost of manufacturing. After developing
                 the basic building blocks of CAEN-based computing devices, we present
                 some preliminary results which indicate that CAEN-based computing
                 devices can meet or exceed the performance of CMOS-based devices.},
   url =	{http://www.cs.cmu.edu/~seth/papers/isca01.pdf},
   confweb =	{http://www.ce.chalmers.se/conf2001},
   doi =	{http://doi.acm.org/10.1145/379240.379262}
}

@InProceedings{budiu-europar00,
   key =	{EuroPar 00},
   author =	{Mihai Budiu and Majd Sakr and Kip Walker and Seth Copen Goldstein},
   title =	{{BitValue} Inference: Detecting and Exploiting Narrow Bitwidth Computations},
   booktitle =	{European Conference on Parallel Processing (EUROPAR)},
   series =	{Lecture Notes in Computer Science},
   volume =	{1900},
   pages =	{969--979},
   publisher =	{Springer Verlag},
   address =	{M\"{u}nich, Germany},
   year =	{2000},
   abstract =	{We present a compiler algorithm called BitValue, which can discover
                 both unused and constant bits in dusty-deck C programs. BitValue uses
                 forward and backward dataflow analyses, generalizing constant-folding
                 and dead-code detection at the bit-level. This algorithm enables
                 compiler optimizations which target special processor architectures for
                 computing on non-standard bitwidths. Using this algorithm we show that
                 up to 31\% of the computed bytes are thrown away (for programs from
                 SpecINT95 and Mediabench). A compiler for reconfigurable hardware uses
                 this algorithm to achieve substantial reductions (up to 20-fold) in the
                 size of the synthesized circuits.},
   note =	{An expanded version is in technical report CMU-CS-00-141},
   url =	{http://www.cs.cmu.edu/~mihaib/research/europar00.pdf},
   url2 =	{http://link.springer.de/link/service/series/0558/papers/1900/19000969.pdf},
   confweb =	{http://www.informatik.uni-trier.de/GI/FG-014/Announce/2000/EuroPar2000.CFP.html}
}
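
As a rough illustration of the bit-level dataflow facts described in the BitValue abstract above, the hypothetical Python sketch below tracks per-bit constants in a forward pass and per-bit liveness in a backward pass for a single AND operation. The 8-bit width, the string encoding of bit facts, and the function names are assumptions made for brevity, not details taken from the paper.

# A minimal, hypothetical sketch in the spirit of BitValue (not the paper's
# implementation).  Each 8-bit value is a string of per-bit facts, MSB first:
# '0'/'1' = known constant bit, 'u' = unknown, 'x' = dead (unused) bit.

WIDTH = 8

def fwd_and(a, b):
    """Forward: a known 0 on either side forces a known 0 in the result."""
    out = []
    for x, y in zip(a, b):
        if x == '0' or y == '0':
            out.append('0')
        elif x == '1' and y == '1':
            out.append('1')
        else:
            out.append('u')
    return ''.join(out)

def bwd_and(live_out, other):
    """Backward: a bit of one operand is dead if the result bit is dead or the
    other operand's bit is a known 0 (the AND masks it away)."""
    return ''.join('x' if (l == 'x' or o == '0') else 'u'
                   for l, o in zip(live_out, other))

# Example: y = x & 0x0F.  The constant's high nibble is all zeros, so the
# forward pass proves y's high nibble is 0 and the backward pass shows the
# high nibble of x is never used -- a narrower datapath suffices.
const = '00001111'
x_unknown = 'u' * WIDTH
y = fwd_and(x_unknown, const)          # '0000uuuu'
x_live = bwd_and('u' * WIDTH, const)   # 'xxxxuuuu'
print(y, x_live)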

@Article{goldstein-ieee00,
   key =	{Computer 00},
   author =	{Seth Copen Goldstein and Herman Schmit and Mihai Budiu and Srihari Cadambi and Matt Moe and Reed Taylor},
   title =	{{PipeRench}: A Reconfigurable Architecture and Compiler},
   journal =	{IEEE Computer},
   volume =	{33},
   number =	{4},
   pages =	{70--77},
   month =	{April},
   year =	{2000},
   abstract =	{With the proliferation of highly specialized embedded computer systems
                 has come a diversification of workloads for computing devices.
                 General-purpose processors are struggling to efficiently meet these
                 applications' disparate needs, and custom hardware is rarely feasible.
                 According to the authors, reconfigurable computing, which combines the
                 flexibility of general-purpose processors with the efficiency of custom
                 hardware, can provide the alternative. PipeRench and its associated
                 compiler comprise the authors' new architecture for reconfigurable
                 computing. Combined with a traditional digital signal processor,
                 microcontroller or general-purpose processor, PipeRench can support a
                 system's various computing needs without requiring custom hardware. The
                 authors describe the PipeRench architecture and how it solves some of
                 the pre-existing problems with FPGA architectures, such as logic
                 granularity, configuration time, forward compatibility, hard
                 constraints and compilation time.},
   url =	{http://www.cs.cmu.edu/~mihaib/research/computer.pdf},
   url2 =	{http://ieeexplore.ieee.org/iel5/2/18132/00839324.pdf}
}

@Article{birman-tocs99,
   author =	{Kenneth P. Birman and Mark Hayden and \"{O}znur \"{O}zkasap and Zhen Xiao and Mihai Budiu and Yaron Minsky},
   title =	{Bimodal Multicast},
   journal =	{Transactions on Computer Systems (TOCS)},
   volume =	{17},
   number =	{2},
   pages =	{41--88},
   month =	{May},
   year =	{1999},
   abstract =	{There are many methods for making a multicast protocol ``reliable''.
                 At one end of the spectrum, a reliable multicast protocol might offer
                 atomicity guarantees, such as all-or-nothing delivery, delivery
                 ordering, and perhaps additional properties such as virtually
                 synchronous addressing. At the other are protocols that use local
                 repair to overcome transient packet loss in the network, offering
                 ``best effort'' reliability. Yet none of this prior work has treated
                 stability of multicast delivery as a basic reliability property, such
                 as might be needed in an internet radio, TV, or conferencing
                 application. This paper looks at reliability with a new goal:
                 development of a multicast protocol which is reliable in a sense that
                 can be rigorously quantified and includes throughput stability
                 guarantees. We characterize this new protocol as a ``bimodal
                 multicast'' in reference to its reliability model, which corresponds to
                 a family of bimodal probability distributions. Here, we introduce the
                 protocol, provide a theoretical analysis of its behavior, review
                 experimental results, and discuss some candidate applications. These
                 confirm that bimodal multicast is reliable, scalable, and that the
                 protocol provides remarkably stable delivery throughput.},
   url =	{http://www.cs.cmu.edu/~mihaib/research/pbcast.pdf},
   doi =	{http://doi.acm.org/10.1145/312203.312207},
   url2 =	{http://www.acm.org/pubs/citations/journals/tocs/1999-17-2/p41-birman}
}
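
To make the gossip-based repair idea in the abstract above concrete, here is a deliberately simplified Python sketch of one synchronous gossip round: after an unreliable multicast, every process compares digests with a few random peers and fetches whatever it is missing. The fanout, the data structures, and the gossip_round helper are illustrative assumptions, not the protocol as specified in the paper.

# A minimal, hypothetical round of gossip repair in the spirit of bimodal
# multicast (pbcast), not the protocol as specified in the paper: each process
# sends a digest of the messages it holds to a few random peers, and the peers
# answer with anything the sender is missing.
import random

def gossip_round(holdings, fanout=2, rng=random):
    """holdings: dict mapping process id -> set of message ids it has received."""
    procs = list(holdings)
    transfers = []  # (receiver, message ids the peer has but the receiver lacks)
    for p in procs:
        for q in rng.sample([r for r in procs if r != p], fanout):
            # p gossips its digest to q; q replies with the messages p lacks.
            transfers.append((p, holdings[q] - holdings[p]))
    for p, missing in transfers:      # apply all repairs at the end of the round
        holdings[p] |= missing
    return holdings

# Example: message 1 initially reaches only half the group; a few gossip
# rounds push delivery to (almost) every process with high probability.
state = {i: ({1} if i % 2 == 0 else set()) for i in range(8)}
for _ in range(3):
    gossip_round(state)
print(state)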

@InProceedings{goldstein-isca99,
   author =	{Seth Copen Goldstein and Herman Schmit and Matthew Moe and Mihai Budiu and Srihari Cadambi and R. Reed Taylor and Ronald Laufer},
   title =	{{PipeRench}: a Coprocessor for Streaming Multimedia Acceleration},
   booktitle =	{International Symposium on Computer Architecture (ISCA)},
   pages =	{28--39},
   address =	{Atlanta, GA},
   year =	{1999},
   abstract =	{Future computing workloads will emphasize an architecture's ability to
                 perform relatively simple calculations on massive quantities of
                 mixed-width data. This paper describes a novel reconfigurable fabric
                 architecture, PipeRench, optimized to accelerate these types of
                 computations. PipeRench enables fast, robust compilers, supports
                 forward compatibility, and virtualizes configurations, thus removing
                 the fixed size constraint present in other fabrics. For the first time
                 we explore how the bit-width of processing elements affects performance
                 and show how the PipeRench architecture has been optimized to balance
                 the needs of the compiler against the realities of silicon. Finally, we
                 demonstrate extreme performance speedup on certain computing kernels
                 (up to 190x versus a modern RISC processor), and analyze how this
                 acceleration translates to application speedup.},
   url =	{http://www.cs.cmu.edu/~mihaib/research/isca99.pdf},
   confweb =	{http://www.informatik.uni-trier.de/~ley/db/conf/isca/isca99.html},
   doi =	{http://doi.acm.org/10.1145/300979.300982}
}

@InProceedings{budiu-fpga99,
   author =	{Mihai Budiu and Seth Copen Goldstein},
   title =	{Fast Compilation for Pipelined Reconfigurable Fabrics},
   booktitle =	{ACM/SIGDA International Symposium on Field Programmable Gate Arrays
                 (FPGA)},
   pages =	{195--205},
   address =	{Monterey, CA},
   year =	{1999},
   abstract =	{In this paper we describe a compiler which quickly synthesizes high
                 quality pipelined datapaths for pipelined reconfigurable devices. The
                 compiler uses the same internal representation to perform synthesis,
                 module generation, optimization, and place and route. The core of the
                 compiler is a linear time place and route algorithm more than two
                 orders of magnitude faster than traditional CAD tools. The key behind
                 our approach is that we never backtrack, rip-up, or re-route. Instead,
                 the graph representing the computation is preprocessed to guarantee
                 routability by inserting lazy noops. The preprocessing step provides
                 enough information to make a greedy strategy feasible. The compilation
                 speed is approximately 3000 bit-operations/second (on a PII/400MHz) for
                 a wide range of applications. The hardware utilization averages 60\% on
                 the target device, PipeRench.},
   url =	{http://www.cs.cmu.edu/~mihaib/research/fpga99.pdf},
   confweb =	{http://portal.acm.org/toc.cfm?id=296399&dl=portal&dl=ACM&type=proceeding&idx=SERIES100&part=Proceedings&WantType=Proceedings&title=International%20Symposium%20on%20Field%20Programmable%20Gate%20Arrays},
   doi =	{http://doi.acm.org/10.1145/296399.296459}
}
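
The abstract above attributes the linear-time, no-backtracking place and route to a preprocessing pass that inserts noops until every connection is local. The Python sketch below shows that preprocessing step in isolation on a toy dataflow graph; the staging-by-depth rule, the node names, and the insert_noops helper are assumptions chosen for illustration rather than the compiler's actual algorithm.

# A minimal, hypothetical sketch of the "insert noops, then place greedily"
# idea from the abstract (not the CASH/PipeRench toolchain itself).
# Assumption: nodes of a dataflow DAG are assigned to pipeline stages by their
# depth; any edge spanning more than one stage gets pass-through noops, so a
# greedy, row-by-row placement only ever routes to the next stage.

from collections import defaultdict

def stage_of(node, preds, memo):
    """Stage = longest path from any input (ASAP levelisation)."""
    if node not in memo:
        memo[node] = 1 + max((stage_of(p, preds, memo) for p in preds[node]),
                             default=-1)
    return memo[node]

def insert_noops(edges):
    """Return a new edge list in which every edge spans exactly one stage."""
    preds = defaultdict(list)
    nodes = set()
    for u, v in edges:
        preds[v].append(u)
        nodes.update((u, v))
    memo = {}
    for n in nodes:
        stage_of(n, preds, memo)
    new_edges, fresh = [], 0
    for u, v in edges:
        src = u
        for s in range(memo[u] + 1, memo[v]):
            noop = f"noop{fresh}"
            fresh += 1
            memo[noop] = s                 # the pass-through occupies stage s
            new_edges.append((src, noop))
            src = noop
        new_edges.append((src, v))
    return new_edges, memo

# Example: c depends on a (stage 0) and on b->tmp (stage 1), so the long a->c
# edge receives one pass-through noop; afterwards every edge is local to
# adjacent stages and a greedy placement needs no rip-up or re-routing.
edges = [("a", "c"), ("b", "tmp"), ("tmp", "c")]
print(insert_noops(edges))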