I am working on a lexical analyzer that reads Cm assembly and creates tokens, and for the most part it is going perfectly. I have just two small problems. First and foremost, I am having difficulty differentiating between machine code and line numbers. As I have it right now, if the first character is a digit and the length of the string is < 2, I assign it as a line number. The issue here is that this is also the description of the machine code. In actuality, the line number can exceed 2 digits, and the machine code doesn't necessarily have to start with a digit, since it is in hexadecimal. I am having great difficulty coming up with conditions that will satisfy these. The second issue is a very minor one: when it comes to comments, the lexer reads every character except the last one. Please find a sample input as well as my code below. Thank you all very much for the help; it is more than greatly appreciated!
import java.util.List;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
/**
 * Line-oriented lexer for a Cm assembly listing.
 *
 * <p>Each input line is scanned left to right and split into {@link Token}s:
 * a comment (from {@code ';'} to the end of the line), numeric fields that
 * start with an ASCII digit, and alphanumeric atoms that are either mnemonics
 * or labels.
 */
public class Lexer
{
    private static File srcFile = null;
    private static String srcFilename = "<srcFilename>";

    /**
     * A token that starts with a digit is reported as an address when it has at
     * least this many characters, and as a line number otherwise.
     *
     * NOTE(review): this length heuristic cannot tell a two-digit machine-code
     * byte from a line number, and hexadecimal fields that start with a letter
     * (e.g. "FF") never reach the numeric branch at all. Telling the columns
     * apart reliably needs positional information about the listing format.
     */
    private static final int ADDRESS_MIN_LENGTH = 4;

    /** The kinds of token this lexer can produce. */
    public static enum Type
    {
        MNUMONIC_NAMES, LABELS, ADDRESSES, OFFSETS, COMMENTS, LINE_NUMBER;
    }

    /** An immutable (type, text) pair produced by {@link #lex(String)}. */
    public static class Token
    {
        /** The token's category. */
        public final Type t;
        /** The token's text exactly as it appeared in the input. */
        public final String c;

        /**
         * Creates a token.
         *
         * @param t the token's category
         * @param c the token's text
         */
        public Token(Type t, String c)
        {
            this.t = t;
            this.c = c;
        }

        /**
         * Renders the token as {@code TYPE<text>}, e.g. {@code COMMENTS<; hi>}.
         *
         * @return the type name followed by the text in angle brackets
         */
        @Override
        public String toString()
        {
            return t.name() + "<" + c + ">";
        }
    }

    /**
     * Returns the run of letters and digits in {@code s} that starts at index
     * {@code i}. Used for mnemonics, labels and numeric fields, which are all
     * single words.
     *
     * @param s the text to read from
     * @param i the index of the first character of the atom
     * @return the longest letter/digit run starting at {@code i}; empty when
     *         the character at {@code i} is not a letter or digit, or when
     *         {@code i == s.length()}
     */
    public static String getAtom(String s, int i)
    {
        int end = i;
        while (end < s.length() && Character.isLetterOrDigit(s.charAt(end)))
        {
            end++;
        }
        return s.substring(i, end);
    }

    /**
     * Returns the comment that starts at index {@code i}, i.e. everything from
     * {@code i} through the last character of {@code s}.
     *
     * @param s the line being lexed
     * @param i the index of the comment's first character
     * @return the remainder of {@code s} starting at {@code i}
     */
    public static String getComment(String s, int i)
    {
        // substring(i) runs to the end of the string, so the final character
        // of the comment is kept.
        return s.substring(i);
    }

    /**
     * Splits one line of input into tokens.
     *
     * @param input a single line of the listing
     * @return the tokens found, in order of appearance
     */
    public static List<Token> lex(String input)
    {
        List<Token> result = new ArrayList<Token>();
        for (int i = 0; i < input.length();)
        {
            switch (input.charAt(i))
            {
                // A comment runs from ';' to the end of the line.
                case ';':
                {
                    String comment = getComment(input, i);
                    i += comment.length();
                    result.add(new Token(Type.COMMENTS, comment));
                    // Without this break the numeric case below would also
                    // run and index past the end of the line.
                    break;
                }
                // A field that starts with an ASCII digit.
                case '0': case '1': case '2': case '3': case '4':
                case '5': case '6': case '7': case '8': case '9':
                {
                    String number = getAtom(input, i);
                    i += number.length();
                    if (number.length() < ADDRESS_MIN_LENGTH)
                    {
                        result.add(new Token(Type.LINE_NUMBER, number));
                    }
                    else
                    {
                        result.add(new Token(Type.ADDRESSES, number));
                    }
                    break;
                }
                // Anything else is whitespace, a mnemonic or a label.
                default:
                {
                    if (Character.isWhitespace(input.charAt(i)))
                    {
                        i++;
                    }
                    else
                    {
                        String atom = getAtom(input, i);
                        if (atom.isEmpty())
                        {
                            // Punctuation and other non-alphanumeric characters
                            // yield an empty atom; consume the character itself
                            // so the scan always moves forward.
                            atom = input.substring(i, i + 1);
                        }
                        i += atom.length();
                        // Check is declared outside this file; presumably
                        // isMnem reports whether the text is a known mnemonic.
                        if (Check.isMnem(atom))
                        {
                            result.add(new Token(Type.MNUMONIC_NAMES, atom));
                        }
                        else
                        {
                            result.add(new Token(Type.LABELS, atom));
                        }
                    }
                    break;
                }
            }
        }
        return result;
    }

    /**
     * Test driver: lexes a file line by line and prints one token per line.
     *
     * @param args optional; {@code args[0]} is the path of the file to lex.
     *             When absent, the built-in sample path is used.
     * @throws IOException if the file cannot be opened or read
     */
    public static void main(String[] args) throws IOException {
        if (args != null && args.length > 0)
        {
            srcFilename = args[0];
        }
        else
        {
            srcFilename = "C:\\Users\\abdcg\\Desktop\\School\\Concordia\\Semester 4\\SOEN 341\\Project B\\Sprint 1\\Lexer test\\Test 2.txt";
        }
        srcFile = new File(srcFilename);

        // try-with-resources closes the reader even when reading fails.
        try (BufferedReader br = new BufferedReader(new FileReader(srcFile)))
        {
            String line;
            while ((line = br.readLine()) != null)
            {
                for (Token t : lex(line))
                {
                    System.out.println(t);
                }
            }
        }
    }
}
Line Addr Machine Code Label Assembly Code Comments
1 0000 00 halt
2 0001 01 pop
3 0002 02 dup
4 0003 03 exit
5 0004 04 ret
6 0005 0C not
7 0006 0D and
8 0007 0E or
9 0008 0F xor
10 0009 10 neg
11 000A 11 inc
12 000B 12 dec
13 000C 13 add
14 000D 14 sub
15 000E 15 mul
16 000F 16 div
17 0010 17 rem
18 0011 18 shl
19 0012 19 shr
20 0013 1A teq
21 0014 1B tne
22 0015 1C tlt
23 0016 1D tgt
24 0017 1E tle
25 0018 1F tge
26 0019 00 halt
Aucun commentaire:
Enregistrer un commentaire