|
Message
From: cvs at opencores.org<cvs@o...>
Date: Sat May 26 20:43:01 CEST 2007
Subject: [cvs-checkins] MODIFIED: jop ...
Date: 00/07/05 26:20:43 Added: jop/doc/book/appendix acronyms.tex bytecode.tex bytetable.tex instruction_set.tex Makefile microcode.csv timing.tex Log: start a user manual Revision Changes Path 1.1 jop/doc/book/appendix/acronyms.tex http://www.opencores.org/cvsweb.shtml/jop/doc/book/appendix/acronyms.tex?rev=1.1&content-type=text/x-cvsweb-markup Index: acronyms.tex =================================================================== \newcommand{\gloss}[3]{ \textbf{#1} & #2\\ % \item[#1] {#2}\\{#3} % \textbf{#1} & #2\\ & \mbox{#3}\\ % \textbf{#1} \> \> #2\\ \> \parbox{10cm}{#3}\\ \\ } \begin{longtable}[l]{ll} % abc\=WCET\= \kill % for tabbing \gloss{ADC}{Analog to Digital Converter}{} \gloss{ALU}{Arithmetic and Logic Unit}{The part of a processor that performs arithmetic, logical, and related operations.} \gloss{ASIC}{Application-Specific Integrated Circuit}{} \gloss{BCET}{Best Case Execution Time}{} \gloss{CFG}{Control Flow Graph}{} \gloss{CISC}{Complex Instruction Set Computer}{} \gloss{CLDC}{Connected Limited Device Configuration}{} \gloss{CPI}{average Clock cycles Per Instruction}{} \gloss{CRC}{Cyclic Redundancy Check}{} \gloss{DMA}{Direct Memory Access}{} \gloss{DRAM}{Dynamic Random Access Memory}{} \gloss{EDF}{Earliest Deadline First}{} \gloss{EMC}{Electromagnetic Compatibility}{} \gloss{ESD}{Electrostatic Discharge}{} \gloss{FIFO}{First In, First Out}{} \gloss{FPGA}{Field Programmable Gate Array}{FPGAs are a class of programmable logic devices. 
They contain a matrix of LCs, embedded memory blocks, and sophisticated I/O cells.} \gloss{GC}{Garbage Collect(ion/or)}{} \gloss{IC}{Instruction Count}{} \gloss{ILP}{Instruction Level Parallelism}{} \gloss{JOP}{Java Optimized Processor}{A processor that implements the JVM in hardware with architectural features for time-predictable execution of Java applications for real-time systems.} \gloss{J2ME}{Java2 Micro Edition}{} \gloss{J2SE}{Java2 Standard Edition}{} \gloss{JDK}{Java Development Kit}{} \gloss{JIT}{Just-In-Time}{} \gloss{JVM}{Java Virtual Machine}{} \gloss{LC}{Logic Cell}{The basic element in an FPGA: a 4-bit lookup table with a register.} \gloss{LRU}{Least-Recently Used}{} \gloss{MBIB}{Memory Bytes read per Instruction Byte}{} \gloss{MCIB}{Memory Cycles per Instruction Byte}{} \gloss{MP}{Miss Penalty}{} \gloss{MTIB}{Memory Transactions per Instruction Byte}{} \gloss{MUX}{Multiplexer}{} \gloss{OO}{Object Oriented}{} \gloss{OS}{Operating System}{} \gloss{RISC}{Reduced Instruction Set Computer}{} \gloss{RT}{Real-Time}{} \gloss{RTOS}{Real-Time Operating System}{} \gloss{RTSJ}{Real-Time Specification for Java}{} \gloss{SCADA}{Supervisory Control And Data Acquisition}{} \gloss{SDRAM}{Synchronous DRAM}{} \gloss{SRAM}{Static Random Access Memory}{} \gloss{TOS}{Top Of Stack}{} \gloss{UART}{Universal Asynchronous Receiver/Transmitter}{} \gloss{VHDL}{Very High Speed Integrated Circuit (VHSIC)}{} \gloss{}{Hardware Description Language}{} \gloss{WCET}{Worst-Case Execution Time}{} % \gloss{xxx}{yyy}{} % \gloss{xxx}{yyy}{} % \gloss{xxx}{yyy}{} \end{longtable} 1.1 jop/doc/book/appendix/bytecode.tex http://www.opencores.org/cvsweb.shtml/jop/doc/book/appendix/bytecode.tex?rev=1.1&content-type=text/x-cvsweb-markup Index: bytecode.tex =================================================================== 0 & nop & hw & 1 \\ 1 & aconst\_null & hw & 1 \\ 2 & iconst\_m1 & hw & 1 \\ 3 & iconst\_0 & hw & 1 \\ 4 & iconst\_1 & hw & 1 \\ 5 & iconst\_2 & hw & 1 \\ 6 & iconst\_3 & hw & 1 
\\ 7 & iconst\_4 & hw & 1 \\ 8 & iconst\_5 & hw & 1 \\ 9 & lconst\_0 & mc & 2 \\ 10 & lconst\_1 & mc & 2 \\ 11 & fconst\_0 & Java & \\ 12 & fconst\_1 & Java & \\ 13 & fconst\_2 & Java & \\
14 & dconst\_0 & - & \\
15 & dconst\_1 & - & \\
16 & bipush & mc & 2 \\
17 & sipush & mc & 3 \\
18 & ldc & mc & 7+r \\
19 & ldc\_w & mc & 8+r \\
20 & ldc2\_w\footnotemark[20] & mc & 17+2*r \\
21 & iload & mc & 2 \\
22 & lload & mc & 11 \\
23 & fload & mc & 2 \\
24 & dload & mc & 11 \\
25 & aload & mc & 2 \\
26 & iload\_0 & hw & 1 \\
27 & iload\_1 & hw & 1 \\
28 & iload\_2 & hw & 1 \\
29 & iload\_3 & hw & 1 \\
30 & lload\_0 & mc & 2 \\
31 & lload\_1 & mc & 2 \\
32 & lload\_2 & mc & 2 \\
33 & lload\_3 & mc & 11 \\
34 & fload\_0 & hw & 1 \\
35 & fload\_1 & hw & 1 \\
36 & fload\_2 & hw & 1 \\
37 & fload\_3 & hw & 1 \\
38 & dload\_0 & mc & 2 \\
39 & dload\_1 & mc & 2 \\
40 & dload\_2 & mc & 2 \\
41 & dload\_3 & mc & 11 \\
42 & aload\_0 & hw & 1 \\
43 & aload\_1 & hw & 1 \\
44 & aload\_2 & hw & 1 \\
45 & aload\_3 & hw & 1 \\
%46 & iaload\footnotemark[46] & mc & 32+3*r \\
46 & iaload\footnotemark[46] & mc & 7+3*r \\
47 & laload & mc & 43+4*r \\
48 & faload\footnotemark[46] & mc & 7+3*r \\
49 & daload & - & \\
50 & aaload\footnotemark[46] & mc & 7+3*r \\
51 & baload\footnotemark[46] & mc & 7+3*r \\
52 & caload\footnotemark[46] & mc & 7+3*r \\
53 & saload\footnotemark[46] & mc & 7+3*r \\
54 & istore & mc & 2 \\
55 & lstore & mc & 11 \\
56 & fstore & mc & 2 \\
57 & dstore & mc & 11 \\
58 & astore & mc & 2 \\
59 & istore\_0 & hw & 1 \\
60 & istore\_1 & hw & 1 \\
61 & istore\_2 & hw & 1 \\
62 & istore\_3 & hw & 1 \\
63 & lstore\_0 & mc & 2 \\
64 & lstore\_1 & mc & 2 \\
65 & lstore\_2 & mc & 2 \\
66 & lstore\_3 & mc & 11 \\
67 & fstore\_0 & hw & 1 \\
68 & fstore\_1 & hw & 1 \\
69 & fstore\_2 & hw & 1 \\
70 & fstore\_3 & hw & 1 \\
71 & dstore\_0 & mc & 2 \\
72 & dstore\_1 & mc & 2 \\
73 & dstore\_2 & mc & 2 \\
74 & dstore\_3 & mc & 11 \\
75 & astore\_0 & hw & 1 \\
76 & astore\_1 & hw & 1 \\
77 & astore\_2 & hw & 1 \\
78 & astore\_3 & hw & 1 \\
%79 & iastore\footnotemark[79] & mc & 35+2*r+w \\
79 & iastore\footnotemark[79] & mc & 9+2*r+w \\
80 & lastore\footnotemark[1] & mc & 48+2*r+2*w \\
81 & fastore\footnotemark[79] & mc & 9+2*r+w \\
82 & dastore & - & \\
83 & aastore\footnotemark[79] & mc & 9+2*r+w \\
84 & bastore\footnotemark[79] & mc & 9+2*r+w \\
85 & castore\footnotemark[79] & mc & 9+2*r+w \\
86 & sastore\footnotemark[79] & mc & 9+2*r+w \\
87 & pop & hw & 1 \\
88 & pop2 & mc & 2 \\
89 & dup & hw & 1 \\
90 & dup\_x1 & mc & 5 \\
91 & dup\_x2 & mc & 7 \\
92 & dup2 & mc & 6 \\
93 & dup2\_x1 & mc & 8 \\
94 & dup2\_x2 & mc & 10 \\
95 & swap\footnotemark[2] & mc & 4 \\
96 & iadd & hw & 1 \\
97 & ladd & Java & \\
98 & fadd & Java & \\
99 & dadd & - & \\
100 & isub & hw & 1 \\
101 & lsub & Java & \\
102 & fsub & Java & \\
103 & dsub & - & \\
104 & imul & mc & 35 \\
105 & lmul & Java & \\
106 & fmul & Java & \\
107 & dmul & - & \\
108 & idiv & Java & \\
109 & ldiv & Java & \\
110 & fdiv & Java & \\
111 & ddiv & - & \\
112 & irem & Java & \\
113 & lrem & Java & \\
114 & frem & Java & \\
115 & drem & - & \\
116 & ineg & mc & 4 \\
117 & lneg & Java & \\
118 & fneg & Java & \\
119 & dneg & - & \\
120 & ishl & hw & 1 \\
121 & lshl & Java & \\
122 & ishr & hw & 1 \\
123 & lshr & Java & \\
124 & iushr & hw & 1 \\
125 & lushr & Java & \\
126 & iand & hw & 1 \\
127 & land & Java & \\
128 & ior & hw & 1 \\
129 & lor & Java & \\
130 & ixor & hw & 1 \\
131 & lxor & Java & \\
132 & iinc & mc & 8 \\
133 & i2l & Java & \\
134 & i2f & Java & \\
135 & i2d & - & \\
136 & l2i & mc & 3 \\
137 & l2f & - & \\
138 & l2d & - & \\
139 & f2i & Java & \\
140 & f2l & - & \\
141 & f2d & - & \\
142 & d2i & - & \\
143 & d2l & - & \\
144 & d2f & - & \\
145 & i2b & Java & \\
146 & i2c & mc & 2 \\
147 & i2s & Java & \\
148 & lcmp & Java & \\
149 & fcmpl & Java & \\
150 & fcmpg & Java & \\
151 & dcmpl & - & \\
152 & dcmpg & - & \\
153 & ifeq & mc & 4 \\
154 & ifne & mc & 4 \\
155 & iflt & mc & 4 \\
156 & ifge & mc & 4 \\
157 & ifgt & mc & 4 \\
158 & ifle & mc & 4 \\
159 & if\_icmpeq & mc & 4 \\
160 & if\_icmpne & mc & 4 \\
161 & if\_icmplt & mc & 4 \\
162 & if\_icmpge & mc & 4 \\
163 & if\_icmpgt & mc & 4 \\
164 & if\_icmple & mc & 4 \\
165 & if\_acmpeq & mc & 4 \\
166 & if\_acmpne & mc & 4 \\
167 & goto & mc & 4 \\
168 & jsr & \emph{not used} & \\
169 & ret & \emph{not used} & \\
170 & tableswitch\footnotemark[170] & Java & \\
171 & lookupswitch\footnotemark[171] & Java & \\
172 & ireturn\footnotemark[172] & mc & 23+r+l \\
173 & lreturn\footnotemark[173] & mc & 25+r+l \\
174 & freturn\footnotemark[172] & mc & 23+r+l \\
175 & dreturn\footnotemark[173] & mc & 25+r+l \\
176 & areturn\footnotemark[172] & mc & 23+r+l \\
177 & return\footnotemark[177] & mc & 21+r+l \\
178 & getstatic & mc & 12+2*r \\
179 & putstatic & mc & 13+r+w \\
180 & getfield & mc & 17+2*r \\
181 & putfield & mc & 20+r+w \\
182 & invokevirtual\footnotemark[182] & mc & 100+4r+l \\
183 & invokespecial\footnotemark[183] & mc & 74+3*r+l \\
184 & invokestatic\footnotemark[183] & mc & 74+3*r+l \\
185 & invokeinterface\footnotemark[185] & mc & 114+6r+l \\
186 & unused\_ba & - & \\
187 & new\footnotemark[187] & Java & \\
188 & newarray\footnotemark[188] & Java & \\
189 & anewarray & Java & \\
190 & arraylength & mc & 6+r \\
191 & athrow\footnotemark[3] & Java & \\
192 & checkcast & Java & \\
193 & instanceof & Java & \\
194 & monitorenter & mc & 11 \\
195 & monitorexit & mc & 10/16 \\
196 & wide & \emph{not used} & \\
197 & multianewarray\footnotemark[4] & Java & \\
198 & ifnull & mc & 4 \\
199 & ifnonnull & mc & 4 \\
200 & goto\_w & \emph{not used} & \\
201 & jsr\_w & \emph{not used} & \\
202 & breakpoint & - & \\
203 & reserved & - & \\
204 & reserved & - & \\
205 & reserved & - & \\
206 & reserved & - & \\
207 & reserved & - & \\
208 & reserved & - & \\
209 & jopsys\_rd\footnotemark[209] & mc & 4+r \\
210 & jopsys\_wr & mc & 5+w \\
211 & jopsys\_rdmem & mc & 4+r \\
212 & jopsys\_wrmem & mc & 5+w \\
213 & jopsys\_rdint & mc & 3 \\
214 & jopsys\_wrint & mc & 3 \\
215 & jopsys\_getsp & mc & 3 \\
216 & jopsys\_setsp & mc & 4 \\
217 & jopsys\_getvp & hw & 1 \\
218 & jopsys\_setvp & mc & 2 \\
219 & jopsys\_int2ext\footnotemark[219] & mc & 14+r+n*(23+w) \\
220 & jopsys\_ext2int\footnotemark[220] & mc & 14+r+n*(23+r) \\
221 & jopsys\_nop & mc & 1 \\
222 & jopsys\_invoke & mc & \\
223 & jopsys\_cond\_move & mc & 5 \\
224 & getstatic\_ref & mc & \\
225 & putstatic\_ref & mc & \\
226 & getfield\_ref & mc & \\
227 & putfield\_ref & mc & \\
228 & getstatic\_long & mc & \\
229 & putstatic\_long & mc & \\
230 & getfield\_long & mc & \\
231 & putfield\_long & mc & \\
232 & reserved & - \\
233 & reserved & - \\
234 & reserved & - \\
235 & reserved & - \\
236 & reserved & - \\
237 & reserved & - \\
238 & reserved & - \\
239 & reserved & - \\
240 & sys\_int\footnotemark[240] & Java \\
241 & sys\_exc\footnotemark[240] & Java \\
242 & reserved & - \\
243 & reserved & - \\
244 & reserved & - \\
245 & reserved & - \\
246 & reserved & - \\
247 & reserved & - \\
248 & reserved & - \\
249 & reserved & - \\
250 & reserved & - \\
251 & reserved & - \\
252 & reserved & - \\
253 & reserved & - \\
254 & sys\_noimp & Java \\
255 & sys\_init & \emph{not used} \\
1.1 jop/doc/book/appendix/bytetable.tex
http://www.opencores.org/cvsweb.shtml/jop/doc/book/appendix/bytetable.tex?rev=1.1&content-type=text/x-cvsweb-markup
Index: bytetable.tex
===================================================================
%\input{../preamble}
\tablename~\ref{tab:appendix:bytecode} lists the bytecodes of the
JVM with their opcode, mnemonics, the implementation type and the
execution time on JOP. In the implementation column \emph{hw} means
that this bytecode has a microcode equivalent, \emph{mc} means that
a microcode sequence implements the bytecode, \emph{Java} means the
bytecode is implemented in Java, and a `-' indicates that this
bytecode is not yet implemented. For bytecodes with a variable
execution time the minimum and maximum values are given.
\begin{longtable}{rllr}
\toprule
Opcode & Instruction & Implementation & Cycles \\
\midrule
\endhead
\bottomrule
\caption{Implemented bytecodes and execution time in cycles}
\label{tab:appendix:bytecode}
\endfoot
% 18 & ldc & mc & 3+m \\
\input{appendix/bytecode}
\end{longtable}
\footnotetext[1]{The exact value is given below.}
\footnotetext[2]{Not tested as javac does not emit the \code{swap}
bytecode.}
\footnotetext[3]{A simple version that stops the JVM. No catch
support.}
\footnotetext[4]{Only dimension 2 supported.}
\footnotetext[20]{The exact value is
$17+\left\{\begin{array}{r@{\quad:\quad}l}
r-2 & r>2 \\
0 & r\le2
\end{array} \right.
+
\left\{\begin{array}{r@{\quad:\quad}l}
r-1 & r>1 \\
0 & r\le1
\end{array} \right.
$
}
\footnotetext[46]{The exact value is
% $19+r+\left\{\begin{array}{r@{\quad:\quad}l}
% r-2 & r\ge6 \\
% 4 & r<6
% \end{array} \right. $
\emph{no hidden wait states at the moment.}
}
\footnotetext[79]{The exact value is
% $22+\left\{\begin{array}{r@{\quad:\quad}l}
% r-2 & r\ge6 \\
% 4 & r<6
% \end{array} \right.
% +w
% $
\emph{no hidden wait states at the moment.}
}
\footnotetext[170]{\codefoot{tableswitch} execution time depends to
a great extent on the caching of the corresponding Java method or
the memory transfer time for the method.}
\footnotetext[171]{\codefoot{lookupswitch} execution time depends to
a great extent on the caching of the corresponding Java method or
the memory transfer time for the method. \codefoot{lookupswitch}
also depends on the argument as it performs a linear search in the
jump table.}
%172 & ireturn & mc & 23+r+b\footnotemark[172] \\
\footnotetext[172]{The exact value is:
$
23+\left\{\begin{array}{r@{\quad:\quad}l}
r-3 & r>3 \\
0 & r\le3
\end{array} \right.
+
% the saved cycles are counted from the instruction after stbcrd
% up to and including the last wait
\left\{\begin{array}{r@{\quad:\quad}l}
l-10 & l>10 \\
0 & l\le10
\end{array} \right.
$
}
%173 & lreturn & mc & 25+r+b\footnotemark[173] \\
\footnotetext[173]{The exact value is:
$
25+\left\{\begin{array}{r@{\quad:\quad}l}
r-3 & r>3 \\
0 & r\le3
\end{array} \right.
+
\left\{\begin{array}{r@{\quad:\quad}l}
l-11 & l>11 \\
0 & l\le11
\end{array} \right.
$
}
%177 & return & mc & 21+r+b\footnotemark[177] \\
\footnotetext[177]{ The exact value is:
$
21+\left\{\begin{array}{r@{\quad:\quad}l}
r-3 & r>3 \\
0 & r\le3
\end{array} \right.
+
\left\{\begin{array}{r@{\quad:\quad}l}
l-9 & l>9 \\
0 & l\le9
\end{array} \right.
$
}
%182 & invokevirtual & mc & 100+4r+b\footnotemark[182] \\
\footnotetext[182]{The exact value is:
$
100+2r+
\left\{\begin{array}{r@{\quad:\quad}l}
r-3 & r>3 \\
0 & r\le3
\end{array} \right.
+
\left\{\begin{array}{r@{\quad:\quad}l}
r-2 & r>2 \\
0 & r\le2
\end{array} \right.
+
\left\{\begin{array}{r@{\quad:\quad}l}
l-37 & l>37 \\
0 & l\le37
\end{array} \right.
$
}
%183 & invokespecial & mc & 74+3r+b\footnotemark[182] \\
%184 & invokestatic & mc & 74+3r+b\footnotemark[182] \\
\footnotetext[183]{The exact value is:
$
74+r+
\left\{\begin{array}{r@{\quad:\quad}l}
r-3 & r>3 \\
0 & r\le3
\end{array} \right.
+
\left\{\begin{array}{r@{\quad:\quad}l}
r-2 & r>2 \\
0 & r\le2
\end{array} \right.
+
\left\{\begin{array}{r@{\quad:\quad}l}
l-37 & l>37 \\
0 & l\le37
\end{array} \right.
$
}
%185 & invokeinterface & mc & 114+6r+b\footnotemark[182] \\
\footnotetext[185]{The exact value is:
$
114+4r+
\left\{\begin{array}{r@{\quad:\quad}l}
r-3 & r>3 \\
0 & r\le3
\end{array} \right.
+
\left\{\begin{array}{r@{\quad:\quad}l}
r-2 & r>2 \\
0 & r\le2
\end{array} \right.
+
\left\{\begin{array}{r@{\quad:\quad}l}
l-37 & l>37 \\
0 & l\le37
\end{array} \right.
$
}
\footnotetext[187]{\codefoot{new} execution time depends to a great
extent on the caching of the corresponding Java method or the memory
transfer time for the method. \codefoot{new} also depends on the
size of the created object as the memory for the object is filled
with zeros -- This will change with the GC}
%188 & newarray & mc & 12+w-7\footnotemark[188] \\
\footnotetext[188]{\codefoot{newarray} execution time depends to a
great extent on the caching of the corresponding Java method or the
memory transfer time for the method. \codefoot{newarray} also
depends on the size of the array as the memory for the object is
filled with zeros -- This will change with the GC}
\footnotetext[209]{The native instructions \codefoot{jopsys\_rd} and
\codefoot{jopsys\_wr} are aliases for the \codefoot{jopsys\_rdmem} and
\codefoot{jopsys\_wrmem} instructions, kept just for compatibility with
existing Java code. IO devices are now memory mapped. In the case
of simple IO devices there are no wait states and the exact values
are 4 and 5 cycles, respectively.}
%14+r+n*(23+w)
\footnotetext[219]{The exact value is
$14+r+n(23+\left\{\begin{array}{r@{\quad:\quad}l}
w-8 & w>8 \\
0 & w\le8
\end{array} \right. )$.
$n$ is the number of words transferred.}
%14+r+n*(23+w)
\footnotetext[220]{The exact value is
$14+r+n(23+\left\{\begin{array}{r@{\quad:\quad}l}
r-10 & r>10 \\
0 & r\le10
\end{array} \right. )$.
$n$ is the number of words transferred.}
\footnotetext[240]{\emph{Is the interrupt and the exception still a
bytecode or is it now inserted just as microcode address?}}
\subsection*{Memory Timing}
The external memory timing is defined in the top level VHDL file
(e.g.\ \code{jopcyc.vhd}) with \code{ram\_cnt} for the number of
cycles for a read and write access. At the moment there is no
difference for a read and write access. For the 100MHz JOP with 15ns
SRAMs this access time is two cycles (\code{ram\_cnt}=2, 20ns).
Therefore the wait state $n_{ws}$ is 1 (\code{ram\_cnt-1}).
%
A basic memory read in microcode is as follows:
\begin{verbatim}
stmra // start read with address store
wait // fill the pipeline with two
wait // wait instructions
ldmrd // push read result on TOS
\end{verbatim}
%
In this sequence the \emph{last} \code{wait} executes for $1+n_{ws}$
cycles. Therefore the whole read sequence takes $4+n_{ws}$ cycles.
For the example with \code{ram\_cnt}=2 this basic memory read takes
5 cycles.
A memory write in microcode is as follows:
\begin{verbatim}
stmwa // store address
stmwd // store data and start the write
wait // fill the pipeline with wait
wait // wait for the memory ready
\end{verbatim}
The last wait again executes for $1+n_{ws}$ cycles and the basic
write takes $4+n_{ws}$ cycles. For the native bytecode \code
{jopsys\_wrmem} an additional \code{nop} instruction for the
\code{nxt} flag is necessary.
The read and write wait states $r_{ws}$ and $w_{ws}$ are:
\begin{equation*}
r_{ws} = w_{ws} =
\left\{\begin{array}{r@{\quad:\quad}l}
ram\_cnt-1 & ram\_cnt>1 \\
0 & ram\_cnt\le1
\end{array} \right.
\end{equation*}
In the instruction timing we use $r$ and $w$ instead of $r_{ws}$ and
$w_{ws}$. The wait states can be hidden by other microcode
instructions between \code{stmra/wait} and \code{stmwd/wait}. The
exact value is given in the footnote.
\subsection*{Instruction Timing}
The bytecodes that access memory are indicated by an $r$ for a
memory read and a $w$ for a memory write in the cycles column ($r$
and $w$ are the additional wait states). The wait cycles for the
memory access have to be added to the execution time. These two
values are implementation dependent (clock frequency versus RAM
access time, data bus width); for the Cyclone EP1C6 board with 15ns
SRAMs and 100MHz clock frequency these values are both 1 cycle
(\code{ram\_cnt}-1).
For some bytecodes, part of the memory latency can be hidden by
executing microcode during the memory access. However, these cycles
can only be subtracted when the wait states (\emph{r} or \emph{w})
are larger than 0 cycles. The exact execution time with the
subtraction of the saved cycles is given in the footnote.
\subsubsection*{Cache Load}
For the method cache load the cache wait state $c_{ws}$ is:
\begin{equation*}
c_{ws} =
\left\{\begin{array}{r@{\quad:\quad}l}
r_{ws}-1 & r_{ws}>1 \\
0 & r_{ws}\le1
\end{array} \right.
\end{equation*}
On a method invoke or return the bytecode has to be loaded into the
cache on a cache miss. The load time $l$ is:
\[
l =
\left\{\begin{array}{r@{\quad:\quad}l}
6+(n+1)(2+c_{ws}) & \mbox{cache miss} \\
4 & \mbox{cache hit}
\end{array} \right.
\]
with $n$ as the length of the method in number of 32-bit words. For
short methods the load time of the method on a cache miss, or part
of it, is hidden by microcode execution. The exact value is given in
the footnote.
% We count the hidden cycles in the same way as for a read or write:
% the instructions between stbcr and the first wait
%
\subsubsection*{lastore}
% 48+2*r+2*w
\begin{equation*}
t_{lastore} = 48+2r_{ws}+w_{ws} + \left\{\begin{array}{r@{\quad:\quad}l}
w_{ws}-3 & w_{ws}>3 \\
0 & w_{ws}\le3
\end{array} \right.
\end{equation*}
\subsubsection*{get/putfield/ref/long}
TODO: add different values for 32-bit, 64-bit and reference type.
%\end{document}
1.1 jop/doc/book/appendix/instruction_set.tex
http://www.opencores.org/cvsweb.shtml/jop/doc/book/appendix/instruction_set.tex?rev=1.1&content-type=text/x-cvsweb-markup
Index: instruction_set.tex
===================================================================
%\input{../preamble}
The instruction set of JOP, the so-called microcode, is described in
this appendix. Each instruction consists of a single instruction
word (8 bits) without extra operands and executes in a single
cycle\footnote{The only multicycle instruction is \codefoot{wait}
and depends on the access time of the external memory}.
\tablename~\ref{tab:appendix:hwreg} lists the registers and internal
memory areas that are used in the dataflow description.
\begin{table}[h]
\centering
\begin{tabular}{ll}
\toprule
Name & Description \\
\midrule
A & Top of the stack\\
B & The element one below the top of stack\\
stack[] & The stack buffer for the rest of the stack\\
sp & The stack pointer for the stack buffer\\
vp & The variable pointer. Points to the first local in
the stack buffer\\
ar & Address register for indirect stack access\\
pc & Microcode program counter\\
offtbl & Table for branch offsets\\
jpc & Program counter for the Java bytecode\\
opd & 8 bit operand from the bytecode fetch unit\\
opd$_{16}$ & 16 bit operand from the bytecode fetch unit\\
ioar & Address register of the IO subsystem\\
memrda & Read address register of the memory subsystem\\
memwra & Write address register of the memory subsystem\\
memrdd & Read data register of the memory subsystem\\
memwrd & Write data register of the memory subsystem\\
mula, mulb & Operands of the hardware multiplier\\
mulr & Result register of the hardware multiplier\\
membcr & Bytecode address and length register of the memory
subsystem\\
bcstart & Method start address register in the method cache\\
\bottomrule
\end{tabular}
\caption{JOP hardware registers and memory areas}\label{tab:appendix:hwreg}
\end{table}
\clearpage
\input{appendix/microcode}
%\end{document}
1.1 jop/doc/book/appendix/Makefile
http://www.opencores.org/cvsweb.shtml/jop/doc/book/appendix/Makefile?rev=1.1&content-type=text/x-cvsweb-markup
Index: Makefile
===================================================================
all:
cd tools && javac *.java
java -cp tools Csv2Latex microcode.csv > microcode.tex
pdfLatex timing
clean:
-cd tools && rm *.class
-rm microcode.tex
-rm *.aux
-rm *.log
-rm *.out
-rm *.bak
-rm *.pdf
1.1 jop/doc/book/appendix/microcode.csv
http://www.opencores.org/cvsweb.shtml/jop/doc/book/appendix/microcode.csv?rev=1.1&content-type=text/x-cvsweb-markup
Index: microcode.csv
===================================================================
pop;Pop the top operand stack value;{00000000};B\to A \\ stack[sp] \to B \\sp-1 \to sp;pop;Pop the top value from the operand stack.
and;Boolean AND \code{int};{00000001};A \wedge B\to A \\ stack[sp] \to B \\sp-1 \to sp;iand;Build the bitwise AND (conjunction) of the two top elements of the stack and push back the result onto the operand stack.
or;Boolean OR \code{int};{00000010};A \vee B\to A \\ stack[sp] \to B \\sp-1 \to sp;ior;Build the bitwise inclusive OR (disjunction) of the two top elements of the stack and push back the result onto the operand stack.
xor;Boolean XOR \code{int};{00000011};A \not\equiv B\to A \\ stack[sp] \to B \\sp-1 \to sp;ixor;Build the bitwise exclusive OR (negation of equivalence) of the two top elements of the stack and push back the result onto the operand stack.
add;Add \code{int};{00000100};A+B\to A \\ stack[sp] \to B \\sp-1 \to sp;iadd;Add the two top elements from the stack and push back the result onto the operand stack.
sub;Subtract \code{int};{00000101};A-B\to A \\ stack[sp] \to B \\sp-1 \to sp;isub;Subtract the two top elements from the stack and push back the result onto the operand stack.
stmra;Store memory read address;{00001000};A \to memrda \\ B\to A \\ stack[sp] \to B \\sp-1 \to sp;--;The top value from the stack is stored as read address in the memory subsystem. This operation starts the concurrent memory read. The processor can continue with other operations. When the datum is needed a \code{wait} instruction stalls the processor till the read access is finished. The value is read with \code{ldmrd}.
stmwa;Store memory write address;{00001001};A \to memwra \\ B\to A \\ stack[sp] \to B \\sp-1 \to sp;--;The top value from the stack is stored as write address in the memory subsystem for a following \code{stmwd}.
stmwd;Store memory write data;{00001010};A \to memwrd \\ B\to A \\ stack[sp] \to B \\sp-1 \to sp;--;The top value from the stack is stored as write data in the memory subsystem. This operation starts the concurrent memory write. The processor can continue with other operations. The \code{wait} instruction stalls the processor till the write access is finished.
stald;Start array load;{00001011};A \to memidx \\ B\to A \\ B\to memptr \\ stack[sp] \to B \\sp-1 \to sp;xaload;The top value from the stack is stored as array index, the next as reference in the memory subsystem. This operation starts the concurrent array load. The processor can continue with other operations. The \code{wait} instruction stalls the processor till the read access is finished. A null pointer or out of bounds exception is generated by the memory subsystem and thrown at the next bytecode fetch.
stast;Start array store;{00001100};A \to memval\\ B\to A \\ stack[sp] \to B \\sp-1 \to sp\\ next cycle \\ A \to memidx \\ B\to A \\ B\to memptr \\ stack[sp] \to B \\sp-1 \to sp;xastore;In the first cycle the top value from the stack is stored as value into the memory subsystem. A microcode \code{pop} has to follow. In the second cycle the top value from the stack is stored as array index, the next as reference in the memory subsystem. This operation starts the concurrent array store. The processor can continue with other operations. The \code{wait} instruction stalls the processor till the write access is finished. A null pointer or out of bounds exception is generated by the memory subsystem and thrown at the next bytecode fetch.
stmul;Multiply \code{int};{00001101};A \to mula \\ B \to mulb \\ B\to A \\ stack[sp] \to B \\sp-1 \to sp;--;The top value from the stack is stored as first operand for the multiplier. The value one below the top of stack is stored as second operand for the multiplier. This operation starts the multiplier. The result is read with the \code{ldmul} instruction.
stbcrd;Start bytecode read;{00001111};A \to membcr \\ B\to A \\ stack[sp] \to B \\sp-1 \to sp;--;The top value from the stack is stored as address and length of a method in the memory subsystem. This operation starts the memory transfer from the main memory to the bytecode cache (DMA). The processor can continue with other operations. The \code{wait} instruction stalls the processor till the transfer has finished. No other memory accesses are allowed during the bytecode read.
st$<$n$>$;Store 32-bit word into local variable;{000100nn};A \to stack[vp+n] \\ B\to A \\ stack[sp] \to B \\sp-1 \to sp;astore_$<$n$>$, istore_$<$n$>$, fstore_$<$n$>$;The value on the top of the operand stack is popped and stored in the local variable at position $n$.
st;Store 32-bit word into local variable;{00010100};A \to stack[vp+opd] \\ B\to A \\ stack[sp] \to B \\sp-1 \to sp;astore, istore, fstore;The value on the top of the operand stack is popped and stored in the local variable at position $opd$. $opd$ is taken from the bytecode instruction stream.
stmi;Store in local memory indirect;{00010101};A \to stack[ar] \\ B\to A \\ stack[sp] \to B \\sp-1 \to sp;--;The top value from the operand stack is stored in the local memory (stack) at position ar.
stvp;Store variable pointer;{00011000};A \to vp \\ B\to A \\ stack[sp] \to B \\sp-1 \to sp;--;The value on the top of the operand stack is popped and stored in the variable pointer (\code{vp}).
stjpc;Store Java program counter;{00011001};A \to jpc \\ B\to A \\ stack[sp] \to B \\sp-1 \to sp;--;The value on the top of the operand stack is popped and stored in the Java program counter (\code{jpc}).
star;Store address register;{00011010};A \to ar \\ B\to A \\ stack[sp] \to B \\sp-1 \to sp;--;The value on the top of the operand stack is popped and stored in the address register (\code{ar}). Due to a pipeline delay the register is valid one cycle later for usage by \code{ldmi} and \code{stmi}.
stsp;Store stack pointer;{00011011};A \to sp \\ B\to A \\ stack[sp] \to B;--;The value on the top of the operand stack is popped and stored in the stack pointer (\code{sp}).
ushr;Logical shift right \code{int};{00011100};B >>> A \to A \\ stack[sp] \to B \\sp-1 \to sp;iushr;The values are popped from the operand stack. An \code{int} result is calculated by shifting the TOS-1 value right by $s$ position, with zero extension, where $s$ is the value of the low 5 bits of the TOS. The result is pushed onto the operand stack.
shl;Shift left \code{int};{00011101};B << A \to A \\ stack[sp] \to B \\sp-1 \to sp;ishl;The values are popped from the operand stack. An \code{int} result is calculated by shifting the TOS-1 value left by $s$ position, where $s$ is the value of the low 5 bits of the TOS. The result is pushed onto the operand stack.
shr;Arithmetic shift right \code{int};{00011110};B >> A \to A \\ stack[sp] \to B \\sp-1 \to sp;ishr;The values are popped from the operand stack. An \code{int} result is calculated by shifting the TOS-1 value right by $s$ position, with sign extension, where $s$ is the value of the low 5 bits of the TOS. The result is pushed onto the operand stack.
stm;Store in local memory;{001nnnnn};A \to stack[n] \\ B\to A \\ stack[sp] \to B \\sp-1 \to sp;--;The top value from the operand stack is stored in the local memory (stack) at position n. These 32 memory destinations represent microcode local variables.
bz;Branch if value is zero;{010nnnnn};\mbox{if}\:\: A = 0 \:\: \mbox{then} \:\: pc+offtbl[n]+2 \to pc \\ B \to A \\ stack[sp] \to B \\sp-1 \to sp;--;If the top value from the operand stack is zero a microcode branch is taken. The value is popped from the operand stack. Due to a pipeline delay, the zero flag is delayed one cycle, i.e.\ the value from the last but one instruction is taken. The branch is followed by two branch delay slots. The branch offset is taken from the table $offtbl$ indexed by $n$.
bnz;Branch if value is not zero;{011nnnnn};\mbox{if}\:\: A \not= 0 \:\: \mbox{then} \:\: pc+offtbl[n]+2 \to pc \\ B \to A \\ stack[sp] \to B \\sp-1 \to sp;--;If the top value from the operand stack is not zero a microcode branch is taken. The value is popped from the operand stack. Due to a pipeline delay, the zero flag is delayed one cycle, i.e.\ the value from the last but one instruction is taken. The branch is followed by two branch delay slots. The branch offset is taken from the table $offtbl$ indexed by $n$.
nop;Do nothing;{10000000};-;nop;The famous no operation instruction.
wait;Wait for memory completion;{10000001};-;--;This instruction stalls the processor until a pending memory instruction (\code{stmra}, \code{stmwd} or \code{stbcrd}) has completed. Two consecutive \code{wait} instructions are necessary for a correct stall of the decode and execute stage.
jbr;Conditional bytecode branch and goto;{10000010};-;ifnull, ifnonnull, ifeq, ifne, iflt, ifge, ifgt, ifle, if_acmpeq, if_acmpne, if_icmpeq, if_icmpne, if_icmplt, if_icmpge, if_icmpgt, if_icmple, goto;Execute a bytecode branch or goto. The branch condition and offset are calculated in the bytecode fetch unit. Arguments must be removed with \code{pop} instructions in the following microcode instructions.
ldm;Load from local memory;{101nnnnn};stack[n] \to A \\ A \to B \\ B \to stack[sp+1] \\sp+1 \to sp;--;The value from the local memory (stack) at position $n$ is pushed onto the operand stack. These 32 memory destinations represent microcode local variables.
ldi;Load from local memory;{110nnnnn};stack[n+32] \to A \\ A \to B \\ B \to stack[sp+1] \\sp+1 \to sp;--;The value from the local memory (stack) at position $n+32$ is pushed onto the operand stack. These 32 memory destinations represent microcode constants.
ldmrd;Load memory read data;{11100010};memrdd \to A \\ A \to B \\ B \to stack[sp+1] \\sp+1 \to sp;--;The value from the memory system after a memory read is pushed onto the operand stack. This operation is usually preceded by two \code{wait} instructions.
ldmul;Load multiplier result;{11100101};mulr \to A \\ A \to B \\ B \to stack[sp+1] \\sp+1 \to sp;(imul);The result of the multiplier is pushed onto the operand stack.
ldbcstart;Load method start;{11100111};bcstart \to A \\ A \to B \\ B \to stack[sp+1] \\sp+1 \to sp;--;The method start address in the method cache is pushed onto the operand stack.
ld$<$n$>$;Load 32-bit word from local variable;{111010nn};stack[vp+n] \to A \\ A \to B \\ B \to stack[sp+1] \\sp+1 \to sp;aload_$<$n$>$, iload_$<$n$>$, fload_$<$n$>$;The local variable at position $n$ is pushed onto the operand stack.
ld;Load 32-bit word from local variable;{11101100};stack[vp+opd] \to A \\ A \to B \\ B \to stack[sp+1] \\sp+1 \to sp;aload, iload, fload;The local variable at position $opd$ is pushed onto the operand stack. $opd$ is taken from the bytecode instruction stream.
ldmi;Load from local memory indirect;{11101101};stack[ar] \to A \\ A \to B \\ B \to stack[sp+1] \\sp+1 \to sp;--;The value from the local memory (stack) at position ar is pushed onto the operand stack.
ldsp;Load stack pointer;{11110000};sp \to A \\ A \to B \\ B \to stack[sp+1] \\sp+1 \to sp;--;The stack pointer is pushed onto the operand stack.
ldvp;Load variable pointer;{11110001};vp \to A \\ A \to B \\ B \to stack[sp+1] \\sp+1 \to sp;--;The variable pointer is pushed onto the operand stack.
ldjpc;Load Java program counter;{11110010};jpc \to A \\ A \to B \\ B \to stack[sp+1] \\sp+1 \to sp;--;The Java program counter is pushed onto the operand stack.
ld_opd_8u;Load 8-bit bytecode operand unsigned;{11110100};opd \to A \\ A \to B \\ B \to stack[sp+1] \\sp+1 \to sp;--;A single byte from the bytecode stream is pushed as \code{int} onto the operand stack.
ld_opd_8s;Load 8-bit bytecode operand signed;{11110101};opd \to A \\ A \to B \\ B \to stack[sp+1] \\sp+1 \to sp;(bipush);A single byte from the bytecode stream is sign-extended to an \code{int} and pushed onto the operand stack.
ld_opd_16u;Load 16-bit bytecode operand unsigned;{11110110};opd_{16} \to A \\ A \to B \\ B \to stack[sp+1] \\sp+1 \to sp;--;A 16-bit word from the bytecode stream is pushed as \code{int} onto the operand stack.
ld_opd_16s;Load 16-bit bytecode operand signed;{11110111};opd_{16} \to A \\ A \to B \\ B \to stack[sp+1] \\sp+1 \to sp;(sipush);A 16-bit word from the bytecode stream is sign-extended to an \code{int} and pushed onto the operand stack.
dup;Duplicate the top operand stack value;{11111000};A \to B \\ B \to stack[sp+1] \\sp+1 \to sp;dup;Duplicate the top value on the operand stack and push it onto the operand stack.
1.1 jop/doc/book/appendix/timing.tex
http://www.opencores.org/cvsweb.shtml/jop/doc/book/appendix/timing.tex?rev=1.1&content-type=text/x-cvsweb-markup
Index: timing.tex
===================================================================
\input{../preamble}
\begin{document}
\chapter{Bytecode Execution Time}
\subimport{../}{bytetable}
\chapter{JOP Instruction Set} \label{appx:jop:instr}
\subimport{../}{instruction_set}
\end{document}
|
 |