Bibliography/aps.bib

% Shahabuddin2017: EuCNC 2017 conference paper on an ASIP for MU-MIMO broadcast precoding.
% Acronyms in the title are brace-protected against style downcasing; the accented
% author name uses a BibTeX special character so it sorts correctly under classic BibTeX.
@inproceedings{Shahabuddin2017,
	title = "{ASIP} design for multiuser {MIMO} broadcast precoding",
	DOI = "10.1109/EuCNC.2017.7980691",
	year = "2017",
	booktitle = "2017 European Conference on Networks and Communications (EuCNC)",
	pages = "1--4",
	author = "Shahabuddin, Shahriar and Silv{\'e}n, Olli and Juntti, Markku",
	abstract = "This paper presents an application-specific instruction-set processor (ASIP) for multiuser multiple-input multiple-output (MU-MIMO) broadcast precoding. The ASIP is designed for a base station (BS) with four antennas to perform user scheduling and precoding. Transport triggered architecture (TTA) is used as the processor template and high level language is used to program the ASIP. Several special function units (SFU) are designed to accelerate norm-based greedy user scheduling and minimum-mean square error (MMSE) precoding. We also program zero forcing dirty paper coding (ZF-DPC) to demonstrate the reusability of the ASIP. A single core provides a throughput of 52.17 Mbps for MMSE precoding and takes an area of 87.53 kgates at 200 MHz on 90 nm technology.",
	file = "MOVE: A Framework for High-Performance Processor Design-07980691.pdf:http://blackhole.federationhq.de/wikindxindex.php?action=attachments{\_}ATTACHMENTS{\_}CORE{\&}method=downloadAttachment{\&}id=13{\&}resourceId=37{\&}filename=a5f894ceb4b9d293397e935fd5595bb0b3806eea:pdf"
}

% Shahbazi2017: journal article on an ASIP-based crypto processor (AES, IDEA, MD5).
% The DOI is stored bare (no resolver prefix) so styles can build the link themselves;
% the URL field keeps the publisher landing page.
@article{Shahbazi2017,
	title = "Design and implementation of an {ASIP}-based cryptography processor for {AES}, {IDEA}, and {MD5}",
	ISSN = "2215-0986",
	DOI = "10.1016/j.jestch.2017.07.002",
	volume = "20",
	number = "4",
	year = "2017",
	URL = "https://www.sciencedirect.com/science/article/pii/S2215098617300885",
	journal = "Engineering Science and Technology, an International Journal",
	pages = "1308--1317",
	author = "Shahbazi, Karim and Eshghi, Mohammad and Mirzaee, Reza Faghih",
	abstract = "In this paper, a new 32-bit ASIP-based crypto processor for AES, IDEA, and MD5 is designed. The instruction-set consists of both general purpose and specific instructions for the above cryptographic algorithms. The proposed architecture has nine function units and two data buses. It has also two types of 32-bit instruction formats for executing Memory Reference (M.R.), Register Reference (R.R.), and Input/Output Reference (I/O R.) instructions. The maximum achieved frequency is 166.916MHz. The encoded output results of the encryption process of a 128-bit input block are obtained after 122, 146 and 170 clock cycles for AES-128, AES-192, and AES-256, respectively. Moreover, it takes 95 clock cycles to encrypt or decrypt a 64-bit input block by using IDEA. Finally, the MD5 hash algorithm requires 469 clock cycles to generate the coded outputs for a block of 512bits. The performance of the proposed processor is compared to some previous and state-of-the-art implementations in terms of speed, latency, throughput, and flexibility.",
	file = "MOVE: A Framework for High-Performance Processor Design-1-s2.0-S2215098617300885-main.pdf:http://blackhole.federationhq.de/wikindxindex.php?action=attachments{\_}ATTACHMENTS{\_}CORE{\&}method=downloadAttachment{\&}id=12{\&}resourceId=36{\&}filename=df4677a0755cfe1d4c8470892cefeac1e4147f43:pdf",
	keywords = "ASIP,AES,Crypto Processor,IDEA,MD5"
}

% Arnold2001: CODES '01 paper on detecting and exploiting domain-specific instruction set extensions.
% Page range uses the ASCII double hyphen; the percent sign in the abstract is escaped so the
% entry stays valid if a style prints the abstract.
@inproceedings{Arnold2001,
	title = "Designing Domain-Specific Processors",
	ISBN = "1581133642",
	DOI = "10.1145/371636.371677",
	series = "CODES '01",
	year = "2001",
	URL = "https://doi.org/10.1145/371636.371677",
	booktitle = "Proceedings of the Ninth International Symposium on Hardware/Software Codesign",
	pages = "61--66",
	author = "Arnold, Marnix and Corporaal, Henk",
	abstract = "We present a semi-automated method for the detection and exploitation of application domain specific instruction set extensions for embedded (VLIW) processors. It consists of three steps: the first step detects frequently occurring operation patterns, in the second step, the patterns are grouped and implemented in a number of Special Function Units (SFUs) and the third step incorporates the custom operations into the code generation process. Experiments show that the SFUs generated and exploited with our methodology can result in architectures that perform up to 30\% better than architectures of the same cost without SFUs.",
	publisher = "Association for Computing Machinery",
	file = "A high-level synthesis flow for custom instruction set extensions for application-specific processors-arnold2001.pdf:http://blackhole.federationhq.de/wikindxindex.php?action=attachments{\_}ATTACHMENTS{\_}CORE{\&}method=downloadAttachment{\&}id=5{\&}resourceId=27{\&}filename=9ac18add40bd4cbbd1ef6c2c7315177d720f9032:pdf",
	keywords = "instruction set synthesis,design space exploration"
}

% Cheng2004: DAC '04 paper introducing FITS (framework-based instruction-set tuning synthesis).
% Page range uses the ASCII double hyphen; the FITS acronym is brace-protected in the title.
@inproceedings{Cheng2004,
	title = "{FITS}: Framework-Based Instruction-Set Tuning Synthesis for Embedded Application Specific Processors",
	ISBN = "1581138288",
	DOI = "10.1145/996566.996810",
	series = "DAC '04",
	year = "2004",
	URL = "https://doi.org/10.1145/996566.996810",
	booktitle = "Proceedings of the 41st Annual Design Automation Conference",
	pages = "920--923",
	author = "Cheng, Allen and Tyson, Gary and Mudge, Trevor",
	abstract = "We propose a new instruction synthesis paradigm that falls between a general-purpose embedded processor and a synthesized application specific processor (ASP). This is achieved by replacing the fixed instruction and register decoding of general purpose embedded processor with programmable decoders that can achieve ASP performance with the fabrication advantages of a mass produced single chip solution.",
	publisher = "Association for Computing Machinery",
	file = "A high-level synthesis flow for custom instruction set extensions for application-specific processors-cheng2004.pdf:http://blackhole.federationhq.de/wikindxindex.php?action=attachments{\_}ATTACHMENTS{\_}CORE{\&}method=downloadAttachment{\&}id=9{\&}resourceId=31{\&}filename=26fcc9421feca0398d0718e7010b691880028681:pdf",
	keywords = "instruction synthesis,code density,reconfigurable processors,energy efficient,configurable architecture,16-bit ISA,instruction encoding,ASP,low-power,embedded processor"
}

% Cong2004: FPGA '04 paper on application-specific instruction generation for configurable processors.
% Page range uses the ASCII double hyphen so it renders under any style and encoding.
@inproceedings{Cong2004,
	title = "Application-Specific Instruction Generation for Configurable Processor Architectures",
	ISBN = "1581138296",
	DOI = "10.1145/968280.968307",
	series = "FPGA '04",
	year = "2004",
	URL = "https://doi.org/10.1145/968280.968307",
	booktitle = "Proceedings of the 2004 ACM/SIGDA 12th International Symposium on Field Programmable Gate Arrays",
	pages = "183--189",
	author = "Cong, Jason and Fan, Yiping and Han, Guoling and Zhang, Zhiru",
	abstract = "Designing an application-specific embedded system in nanometer technologies has become more difficult than ever due to the rapid increase in design complexity and manufacturing cost. Efficiency and flexibility must be carefully balanced to meet different application requirements. The recently emerged configurable and extensible processor architectures offer a favorable tradeoff between efficiency and flexibility, and a promising way to minimize certain important metrics (e.g., execution time, code size, etc.) of the embedded processors. This paper addresses the problem of generating the application-specific instructions to improve the execution speed for configurable processors. A set of algorithms, including pattern generation, pattern selection, and application mapping, are proposed to efficiently utilize the instruction set extensibility of the target configurable processor. Applications of our approach to several real-life benchmarks on the Altera Nios processor show encouraging performance speedup (2.75X on average and up to 3.73X in some cases).",
	publisher = "Association for Computing Machinery",
	file = "A high-level synthesis flow for custom instruction set extensions for application-specific processors-cong2004.pdf:http://blackhole.federationhq.de/wikindxindex.php?action=attachments{\_}ATTACHMENTS{\_}CORE{\&}method=downloadAttachment{\&}id=6{\&}resourceId=28{\&}filename=01a872d6fdec478fa92e11bb0374654229c1936a:pdf",
	keywords = "ASIP,configurable processor,compilation,binate covering,technology mapping"
}

% Corporaal1991: Supercomputing '91 paper presenting the MOVE processor design framework.
% The second author's parenthesised initials were garbled on export and are restored here;
% the ISBN-10 regains its leading zero (checksum-valid); pages use the ASCII double hyphen.
% NOTE(review): confirm the preferred rendering of the second author's name against the publisher record.
@inproceedings{Corporaal1991,
	title = "{MOVE}: A Framework for High-Performance Processor Design",
	ISBN = "0897914597",
	DOI = "10.1145/125826.126159",
	series = "Supercomputing '91",
	year = "1991",
	URL = "https://doi.org/10.1145/125826.126159",
	booktitle = "Proceedings of the 1991 ACM/IEEE Conference on Supercomputing",
	pages = "692--701",
	author = "Corporaal, Henk and Mulder, Hans (J. M.)",
	publisher = "Association for Computing Machinery",
	file = "A high-level synthesis flow for custom instruction set extensions for application-specific processors-corporaal1991.pdf:http://blackhole.federationhq.de/wikindxindex.php?action=attachments{\_}ATTACHMENTS{\_}CORE{\&}method=downloadAttachment{\&}id=4{\&}resourceId=26{\&}filename=883aa5e10678cebe6262f47f1ed98c0f9f78e653:pdf"
}

% Good2006: IEEE TCAS-I article on two low-area FPGA designs for AES.
% Title acronyms are brace-protected against style downcasing; the percent sign in the
% abstract is escaped so the entry stays valid if a style prints the abstract.
@article{Good2006,
	title = "Very small {FPGA} application-specific instruction processor for {AES}",
	ISSN = "1558-0806",
	DOI = "10.1109/TCSI.2006.875179",
	volume = "53",
	number = "7",
	year = "2006",
	journal = "IEEE Transactions on Circuits and Systems I: Regular Papers",
	pages = "1477--1486",
	author = "Good, T. and Benaissa, M.",
	abstract = "This paper presents two low-area designs for the advanced encryption standard on field-programmable gate arrays (FPGAs). Both these designs are believed to be the smallest to date. The first design is an 8-bit application-specific instruction processor, which supports key expansion (currently programmed for a 128-bit key), encipher and decipher. The design utilizes less than 60\% of the resources of the smallest available Xilinx Spartan II FPGA (XC2S15). The average encipher-decipher throughput is 2.1 Mbps when clocked at 70 MHz. The design has numerous applications where low area and low power are priorities. The second design, using the Xilinx PicoBlaze soft core is included to provide an embedded 8-bit microcontroller comparison baseline.",
	file = "A high-level synthesis flow for custom instruction set extensions for application-specific processors-good2006.pdf:http://blackhole.federationhq.de/wikindxindex.php?action=attachments{\_}ATTACHMENTS{\_}CORE{\&}method=downloadAttachment{\&}id=7{\&}resourceId=29{\&}filename=4b28d3233609e0b7d7b41e3ab85c2aa0bea3b171:pdf",
	keywords = "application specific integrated circuits;microcontrollers;field programmable gate arrays;cryptography;instruction sets;low-power electronics;embedded systems;logic design;application-specific instruction processor;advanced encryption standard;field-programmable gate arrays;encipher process;decipher process;Xilinx Spartan II FPGA;Xilinx PicoBlaze soft core;embedded microcontroller;2.1 Mbit/s;70 MHz;Field programmable gate arrays;Throughput;Cryptography;Hardware;Microcontrollers;Application specific processors;Application specific integrated circuits;Clocks;Government;Aging;8 bit;advanced encryption standard (AES);application-specific instruction processor (ASIP);field-programmable gate array (FPGA);low area"
}

% Jacome2000: magazine article on design challenges for application-specific processors.
% The second author's "de" particle is placed in the von-part (von Last, First form);
% the journal name regains the ampersand dropped on export; the percent sign in the
% abstract is escaped so the entry stays valid if a style prints the abstract.
@article{Jacome2000,
	title = "Design challenges for new application specific processors",
	ISSN = "1558-1918",
	DOI = "10.1109/54.844333",
	volume = "17",
	number = "2",
	year = "2000",
	URL = "https://ieeexplore.ieee.org/abstract/document/844333",
	journal = "IEEE Design \& Test of Computers",
	pages = "40--50",
	author = "Jacome, M. F. and de Veciana, G.",
	abstract = "Embedded systems form a market that is already larger and growing more rapidly than that of general-purpose computers. In fact, real-time multimedia and signal processing embedded applications currently account for over 90\% of all computer cycles. This article discusses challenges in developing retargetable compilers and synthesis tools for application-specific processor cores targeted at embedded portable digital communications and multimedia systems.",
	file = "A high-level synthesis flow for custom instruction set extensions for application-specific processors-jacome2000.pdf:http://blackhole.federationhq.de/wikindxindex.php?action=attachments{\_}ATTACHMENTS{\_}CORE{\&}method=downloadAttachment{\&}id=3{\&}resourceId=25{\&}filename=b8be69f5ee2430ace4a3e0345086998b70d17c47:pdf",
	keywords = "digital communication;program compilers;multimedia systems;embedded systems;design challenges;application specific processors;embedded systems;real-time multimedia systems;signal processing embedded applications;retargetable compilers;synthesis tools;application-specific processor;digital communications;Application specific processors;VLIW;Multimedia systems;Digital signal processing;Registers;Concurrent computing;Parallel processing;Logic;Discrete cosine transforms;Motion estimation"
}

% Matai2012: ESLsyn 2012 paper on the Trimmed VLIW synthesis methodology.
% The VLIW acronym is brace-protected in the title; percent signs in the abstract are
% escaped so the entry stays valid if a style prints the abstract.
@inproceedings{Matai2012,
	title = "Trimmed {VLIW}: Moving application specific processors towards high level synthesis",
	year = "2012",
	URL = "https://cseweb.ucsd.edu/~jmatai/publications/TrimmedVLIW.pdf",
	booktitle = "2012 Electronic System Level Synthesis Conference",
	pages = "11--16",
	author = "Matai, J. and Oberg, J. and Irturk, A. and Kim, T. and Kastner, R.",
	abstract = "We describe a synthesis methodology called Trimmed VLIW, which we argue lies between application specific processors and high level synthesis. Much like application specific processors, our methodology starts from a known instruction set architecture and customizes it to create the final implementation. However, our approach goes further as we not only add custom functional units and define the parameters of the register file, but we also remove unneeded interconnect, which results in a data path that looks more similar to that created by high level synthesis tools. We show that there are substantial opportunities for eliminating unused resources, which results in an architecture that has significantly smaller area. We compare area, delay and performance results of a base architecture with trimmed one. Preliminary results show by only trimming wires we have an average of 25\% area reduction while improving the performance around 5\%. Furthermore, we evaluated our results with high-level synthesize tools C2V and AutoESL.",
	file = "A high-level synthesis flow for custom instruction set extensions for application-specific processors-TrimmedVLIW.pdf:http://blackhole.federationhq.de/wikindxindex.php?action=attachments{\_}ATTACHMENTS{\_}CORE{\&}method=downloadAttachment{\&}id=1{\&}resourceId=23{\&}filename=8051cdee9de7aaf55d7f27bb06e93022f7bf95d7:pdf",
	keywords = "ant colony optimisation;application specific integrated circuits;high level synthesis;instruction sets;trimmed VLIW;application specific processors;high-level synthesis tool;instruction set architecture;custom functional units;register file parameter;data path;C2V tool;AutoESL tool;ant colony optimization;Registers;Program processors;Wires;VLIW;Resource management;Algorithm design and analysis;High level synthesis"
}

% Nohl2010: ICCAD 2010 tutorial paper on application-specific processor design.
% Same field values as exported; brace-delimited, lowercase field names, author-first order.
@inproceedings{Nohl2010,
  author    = {Nohl, A. and Schirrmeister, F. and Taussig, D.},
  title     = {Application specific processor design: Architectures, design methods and tools},
  booktitle = {2010 IEEE/ACM International Conference on Computer-Aided Design (ICCAD)},
  year      = {2010},
  pages     = {349--352},
  doi       = {10.1109/ICCAD.2010.5653632},
  abstract  = {In this tutorial paper, we will outline a solution for prototyping, programming and implementing Application Specific Instruction-set Processors (ASIPs). A general introduction into this class of processor architectures and their characteristics is provided. The Synopsys Processor Designer tool suite and the LISA language for ASIP design are jointly introduced in the context of a H.264 design example. Finally, implementation results are presented.},
  keywords  = {application specific integrated circuits;instruction sets;integrated circuit design;microprocessor chips;application specific processor design;design methods;application specific instruction-set processors;processor architectures;Synopsys processor designer tool;LISA language;ASIP design;H.264;Pixel;Registers;Finite impulse response filter;Computer architecture;Software;Pipeline processing;Encoding},
  file      = {A high-level synthesis flow for custom instruction set extensions for application-specific processors-nohl2010.pdf:http://blackhole.federationhq.de/wikindxindex.php?action=attachments{\_}ATTACHMENTS{\_}CORE{\&}method=downloadAttachment{\&}id=8{\&}resourceId=30{\&}filename=6b18bdf517b8ee9bcd5597d763b28da4f20bf5c3:pdf},
}

% Plagwitz2019: ReConFig 2019 paper on compiler-based synthesis of ASIP networks from C/C++.
% The FPGAs acronym is brace-protected in the title so sentence-casing styles keep its capitals.
@inproceedings{Plagwitz2019,
	title = "Compiler-Based High-Level Synthesis of Application-Specific Processors on {FPGAs}",
	DOI = "10.1109/ReConFig48160.2019.8994778",
	year = "2019",
	booktitle = "2019 International Conference on ReConFigurable Computing and FPGAs (ReConFig)",
	pages = "1--8",
	author = "Plagwitz, P. and Streit, F. and Becher, A. and Wildermann, S. and Teich, J.",
	abstract = "In order to meet tight performance and/or energy constraints of embedded systems, the implementation of applications in hardware is often a must. However, mapping of algorithms to platforms, as for example Field-Programmable Gate Arrays (FPGAs), still requires comprehensive hardware knowledge and sometimes long design cycles. Modern High-Level Synthesis (HLS) offers a means to ease the generation of hardware implementations from a software specification of an application. Although these tools have improved greatly in recent years, they often do not provide full coverage of important programming constructs and are therefore of limited use when used with existing or automatically generated code. Soft-core processors implemented with FPGA-logic can circumvent this limitation. However, these come with drawbacks in terms of performance and resource requirements as a general-purpose architecture is used to implement the application in software rather than as a highly specialized circuit. As a remedy, our work presents a novel compiler-based synthesis methodology that generates networks of Application-Specific Instruction Set Processors (ASIPs) from unmodified C/C++ algorithms. We thereby bridge the gap between traditional soft-core processors and HLS. To show the practicability of our approach, we present a case study of a JPEG decoder application while investigating design objectives like resource costs and performance. Apart from the generality of the compiler-based approach, our approach also shows better results in terms of required hardware resources and execution times compared to Instruction Set Architecture (ISA)-fixed commercial Xilinx MicroBlaze soft-cores.",
	file = "A high-level synthesis flow for custom instruction set extensions for application-specific processors-plagwitz2019.pdf:http://blackhole.federationhq.de/wikindxindex.php?action=attachments{\_}ATTACHMENTS{\_}CORE{\&}method=downloadAttachment{\&}id=10{\&}resourceId=32{\&}filename=552494b0dfb7f5f9413c05e6fbdc394c9b41e194:pdf",
	keywords = "application specific integrated circuits;embedded systems;field programmable gate arrays;high level synthesis;image coding;instruction sets;microprocessor chips;program compilers;energy constraints;embedded systems;field-programmable gate arrays;high-level synthesis;HLS;soft-core processors;FPGA-logic;Application-Specific Instruction Set Processors;JPEG decoder application;hardware resources;compiler-based High-Level Synthesis;ASIP;ISA-fixed commercial Xilinx MicroBlaze soft-cores;instruction set architecture;FPGA;HLS;ASIP;SoC;Hardware/Software Co-Design"
}

% Pothineni2010: ASP-DAC 2010 paper on a high-level synthesis flow for custom instruction set extensions.
% Same field values as exported; brace-delimited, lowercase field names, author-first order.
@inproceedings{Pothineni2010,
  author    = {Pothineni, N. and Brisk, P. and Ienne, P. and Kumar, A. and Paul, K.},
  title     = {A high-level synthesis flow for custom instruction set extensions for application-specific processors},
  booktitle = {2010 15th Asia and South Pacific Design Automation Conference (ASP-DAC)},
  year      = {2010},
  pages     = {707--712},
  doi       = {10.1109/ASPDAC.2010.5419795},
  url       = {https://ieeexplore.ieee.org/abstract/document/5419795},
  abstract  = {Custom instruction set extensions (ISEs) are added to an extensible base processor to provide application-specific functionality at a low cost. As only one ISE executes at a time, resources can be shared. This paper presents a new high-level synthesis flow targeting ISEs. We emphasize a new technique for resource allocation, binding, and port assignment during synthesis. Our method is derived from prior work on datapath merging, and increases area reduction by accounting for the cost of multiplexors that must be inserted into the resulting datapath to achieve multi-operational functionality.},
  keywords  = {application specific integrated circuits;high level synthesis;instruction sets;microprocessor chips;resource allocation;high-level synthesis flow;custom instruction set extensions;application-specific processors;extensible base processor;resource allocation;port assignment;multioperational functionality;High level synthesis;Application specific processors;Delay;Clocks;Processor scheduling;Resource management;Registers;Computer science;Cost function;Merging},
  file      = {A high-level synthesis flow for custom instruction set extensions for application-specific processors-pothineni2010.pdf:http://blackhole.federationhq.de/wikindxindex.php?action=attachments{\_}ATTACHMENTS{\_}CORE{\&}method=downloadAttachment{\&}id=2{\&}resourceId=24{\&}filename=b0c646e5c7061e429ac742b253c28954b962945a:pdf},
}